diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 734f3509f3843..10579ce61118e 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -231,10 +231,8 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses; bool rtl = code.RightToLeft; bool hasTextInfo = false; - bool textInfoEmitted = false; // Emit locals initialization writer.WriteLine("string runtext = base.runtext!;"); @@ -267,34 +265,52 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, }; using (EmitBlock(writer, clause)) { - EmitAnchors(); - - if (code.BoyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm) - { - if (rbm.PatternSupportsIndexOf) - { - EmitIndexOf(rbm.Pattern); - } - else - { - EmitBoyerMoore(rbm); - } - } - else if (lcc is not null) + // Emit any anchors. + if (!EmitAnchors()) { - if (rtl) - { - EmitLeadingCharacter_RightToLeft(); - } - else + // Either anchors weren't specified, or they don't completely root all matches to a specific location. + + // If whatever search operation we need to perform entails case-insensitive operations + // that weren't already handled via creation of sets, we need to get and store the + // TextInfo object to use (unless RegexOptions.CultureInvariant was specified). + EmitTextInfo(writer, ref hasTextInfo, rm); + + // Emit the code for whatever find mode has been determined. 
+ switch (code.FindOptimizations.FindMode) { - EmitLeadingCharacter_LeftToRight(); + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_LeftToRight(); + break; + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_RightToLeft(); + break; + + default: + Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}"); + goto case FindNextStartingPositionMode.NoSearch; + + case FindNextStartingPositionMode.NoSearch: + writer.WriteLine("return true;"); + break; } } - else - { - writer.WriteLine("return true;"); - } } writer.WriteLine(); @@ -303,15 +319,15 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, writer.WriteLine(!rm.Code.RightToLeft ? "base.runtextpos = runtextend;" : "base.runtextpos = runtextbeg;"); writer.WriteLine("return false;"); - void EmitAnchors() + // Emits any anchors. 
Returns true if the anchor roots any match to a specific location and thus no further + // searching is required; otherwise, false. + bool EmitAnchors() { // Generate anchor checks. - if ((code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) + if ((code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) { - // TODO: RegexInterpreter also factors in a Boyer-Moore prefix check in places Compiled just returns true. - // Determine if we should do so here and in Compiled as well, and potentially update RegexInterpreter. - // Interpreted and Compiled also differ in various places as to whether they update positions, as do LTR vs RTL. Determine why. - switch (code.LeadingAnchor) + // TODO: Interpreted and Compiled differ in various places as to whether they update positions, as do LTR vs RTL. Determine why. + switch (code.FindOptimizations.LeadingAnchor) { case RegexPrefixAnalyzer.Beginning: writer.WriteLine("// Beginning \\A anchor"); @@ -331,7 +347,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.Start: writer.WriteLine("// Start \\G anchor"); @@ -351,7 +367,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.EndZ: // TODO: Why are the LTR and RTL cases inconsistent here with RegexOptions.Compiled? 
@@ -372,9 +388,9 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; - case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check + case RegexPrefixAnalyzer.End: writer.WriteLine("// End \\z anchor"); if (!rtl) { @@ -391,14 +407,14 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; - case RegexPrefixAnalyzer.Bol when !rtl: // Don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline + case RegexPrefixAnalyzer.Bol: // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. + // to boost our position to the next line, and then continue normally with any searches. + Debug.Assert(!rtl, "RightToLeft isn't implemented and should have been filtered out previously"); writer.WriteLine("// Beginning-of-line anchor"); using (EmitBlock(writer, "if (runtextpos > runtextbeg && runtext[runtextpos - 1] != '\\n')")) { @@ -413,131 +429,12 @@ void EmitAnchors() break; } } - } - - void EmitBoyerMoore(RegexBoyerMoore rbm) - { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); - - int beforefirst; - int last; - if (!rtl) - { - //limitLocal = "runtextend"; - beforefirst = -1; - last = rbm.Pattern.Length - 1; - } - else - { - //limitLocal = "runtextbeg"; - beforefirst = rbm.Pattern.Length; - last = 0; - } - - int chLast = rbm.Pattern[last]; - - EmitAdd(writer, "runtextpos", !rtl ? rbm.Pattern.Length - 1 : -rbm.Pattern.Length); - - using (EmitBlock(writer, $"while ({(!rtl ? 
"runtextpos < runtextend" : "runtextpos >= runtextbeg")})")) - { - writer.WriteLine($"ch = {ToLowerIfNeeded(hasTextInfo, options, "runtext[runtextpos]", rbm.CaseInsensitive)};"); - - using (EmitBlock(writer, $"if (ch != {Literal((char)chLast)})")) - { - writer.WriteLine($"ch -= {Literal((char)rbm.LowASCII)};"); - using (EmitBlock(writer, $"if ((uint)ch > ({Literal((char)rbm.HighASCII)} - {Literal((char)rbm.LowASCII)}))")) - { - EmitAdd(writer, "runtextpos", (!rtl ? rbm.Pattern.Length : -rbm.Pattern.Length)); - writer.WriteLine("continue;"); - } - - int negativeRange = rbm.HighASCII - rbm.LowASCII + 1; - if (negativeRange > 1) // High > Low - { - // Create a string to store the lookup table we use to find the offset. - // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate - // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again. - Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed."); - Span span = new char[negativeRange]; - for (int i = 0; i < span.Length; i++) - { - int offset = rbm.NegativeASCII[i + rbm.LowASCII]; - if (offset == beforefirst) - { - offset = rbm.Pattern.Length; - } - else if (rtl) - { - offset = -offset; - } - Debug.Assert(offset >= 0 && offset <= char.MaxValue); - span[i] = (char)offset; - } - - writer.WriteLine($"runtextpos {(rtl ? "-=" : "+=")} {Literal(span.ToString())}[ch];"); - } - else - { - Debug.Assert(negativeRange == 1); // High == Low - int offset = rbm.NegativeASCII[rbm.LowASCII]; - if (offset == beforefirst) - { - offset = rtl ? -rbm.Pattern.Length : rbm.Pattern.Length; - } - EmitAdd(writer, "runtextpos", offset); - } - writer.WriteLine("continue;"); - } - writer.WriteLine(); - writer.WriteLine("int test = runtextpos;"); - writer.WriteLine(); - - for (int i = rbm.Pattern.Length - 2; i >= 0; i--) - { - int charIndex = !rtl ? 
i : rbm.Pattern.Length - 1 - i; - bool sameAsPrev = i < rbm.Pattern.Length - 2 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i + 1 : rbm.Pattern.Length - 1 - (i + 1)]; - bool sameAsNext = i > 0 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i - 1 : rbm.Pattern.Length - 1 - (i - 1)]; - - string condition = $"{ToLowerIfNeeded(hasTextInfo, options, (!rtl ? "runtext[--test]" : "runtext[++test]"), rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charIndex]))} != {Literal(rbm.Pattern[charIndex])}"; - switch ((sameAsPrev, sameAsNext)) - { - case (true, true): - writer.WriteLine($" {condition} ||"); - break; - - case (false, true): - writer.WriteLine($"if ({condition} ||"); - break; - - case (true, false): - writer.WriteLine($" {condition})"); - using (EmitBlock(writer, null)) - { - EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]); - writer.WriteLine("continue;"); - } - writer.WriteLine(); - break; - case (false, false): - using (EmitBlock(writer, $"if ({condition})")) - { - EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]); - writer.WriteLine("continue;"); - } - writer.WriteLine(); - break; - } - } - - writer.WriteLine(!rtl ? - "base.runtextpos = test;" : - "base.runtextpos = test + 1;"); - writer.WriteLine("return true;"); - } + return false; } - void EmitIndexOf(string prefix) + // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern. + void EmitIndexOf_LeftToRight(string prefix) { writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos), {Literal(prefix)});"); writer.WriteLine("if (i >= 0)"); @@ -547,87 +444,89 @@ void EmitIndexOf(string prefix) writer.WriteLine("}"); } - void EmitLeadingCharacter_RightToLeft() + // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. 
+ void EmitIndexOf_RightToLeft(string prefix) { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); + writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(prefix)});"); + writer.WriteLine("if (i >= 0)"); + writer.WriteLine("{"); + writer.WriteLine($" base.runtextpos = runtextbeg + i + {prefix.Length};"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); + } - Debug.Assert(lcc.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft"); - string set = lcc[0].CharClass; - if (RegexCharClass.IsSingleton(set)) + // Emits a right-to-left search for a set at a fixed position from the start of the pattern. + // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) + void EmitFixedSet_RightToLeft() + { + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = code.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); + + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) { - char ch = RegexCharClass.SingletonChar(set); - using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)")) - { - using (EmitBlock(writer, $"if (runtext[i] == {ToLowerIfNeeded(hasTextInfo, options, Literal(ch), lcc[0].CaseInsensitive)})")) - { - writer.WriteLine("base.runtextpos = i + 1;"); - writer.WriteLine("return true;"); - } - } + writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(set.Chars[0])});"); + writer.WriteLine("if (i >= 0)"); + writer.WriteLine("{"); + writer.WriteLine(" base.runtextpos = runtextbeg + i + 1;"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); } else { using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)")) { - using 
(EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set, lcc[0].CaseInsensitive)})")) + using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set.Set, set.CaseInsensitive)})")) { - writer.WriteLine("runtextpos = i + 1;"); + writer.WriteLine("base.runtextpos = i + 1;"); writer.WriteLine("return true;"); } } } } - void EmitLeadingCharacter_LeftToRight() + // Emits a left-to-right search for a set at a fixed position from the start of the pattern, + // and potentially other sets at other fixed positions in the pattern. + void EmitFixedSet_LeftToRight() { - Debug.Assert(lcc is not null && lcc.Length > 0); - - // If minRequiredLength > 0, we already output a more stringent check. In the rare case - // where we were unable to get an accurate enough min required length to ensure it's larger - // than the prefixes we calculated, we also need to ensure we have enough space for those, - // as they also represent a min required length. - if (minRequiredLength < lcc.Length) - { - writer.WriteLine($"// Validate at least {lcc.Length} characters are available to match"); - string endExpr = lcc.Length > 1 ? $"runtextend - {lcc.Length - 1}" : "runtextend"; - using (EmitBlock(writer, $"if (runtextpos >= {endExpr})")) - { - writer.WriteLine("goto ReturnFalse;"); - } - writer.WriteLine(); - } - - writer.WriteLine("global::System.ReadOnlySpan span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);"); + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets; + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; + const int MaxSets = 4; + int setsToUse = Math.Min(sets.Count, MaxSets); // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. 
- Span setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below - int setCharsCount = 0, charClassIndex = 0; - bool canUseIndexOf = - !lcc[0].CaseInsensitive && - (setCharsCount = RegexCharClass.GetSetChars(lcc[0].CharClass, setChars)) > 0 && - !RegexCharClass.IsNegated(lcc[0].CharClass); - bool needLoop = !canUseIndexOf || lcc.Length > 1; + int setIndex = 0; + bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null; + bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitScope loopBlock = default; if (needLoop) { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); - writer.WriteLine(); - string upperBound = lcc.Length > 1 ? $"span.Length - {lcc.Length - 1}" : "span.Length"; + writer.WriteLine("global::System.ReadOnlySpan span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);"); + string upperBound = "span.Length" + (setsToUse > 1 || primarySet.Distance != 0 ? $" - {minRequiredLength - 1}" : ""); loopBlock = EmitBlock(writer, $"for (int i = 0; i < {upperBound}; i++)"); } if (canUseIndexOf) { - charClassIndex = 1; + string span = needLoop ? + "span" : + "global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos)"; - string span = needLoop ? 
"span.Slice(i)" : "span"; - string indexOf = setCharsCount switch + span = (needLoop, primarySet.Distance) switch { - 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(setChars[0])})", - 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])})", - _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", + (false, 0) => span, + (true, 0) => $"{span}.Slice(i)", + (false, _) => $"{span}.Slice({primarySet.Distance})", + (true, _) => $"{span}.Slice(i + {primarySet.Distance})", + }; + + string indexOf = primarySet.Chars!.Length switch + { + 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(primarySet.Chars[0])})", + 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", + 3 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", + _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(new string(primarySet.Chars))})", }; if (needLoop) @@ -640,60 +539,93 @@ void EmitLeadingCharacter_LeftToRight() writer.WriteLine("i += indexOfPos;"); writer.WriteLine(); - if (lcc.Length > 1) + if (setsToUse > 1) { - using (EmitBlock(writer, $"if (i >= span.Length - {lcc.Length - 1})")) + using (EmitBlock(writer, $"if (i >= span.Length - {minRequiredLength - 1})")) { writer.WriteLine("goto ReturnFalse;"); } + writer.WriteLine(); } } else { writer.WriteLine($"int i = {indexOf};"); - using (EmitBlock(writer, "if (i < 0)")) + using (EmitBlock(writer, "if (i >= 0)")) { - writer.WriteLine("goto ReturnFalse;"); + writer.WriteLine("base.runtextpos = runtextpos + i;"); + writer.WriteLine("return true;"); } } - writer.WriteLine(); + + setIndex = 1; } - Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - bool hasCharClassConditions = false; - if (charClassIndex < 
lcc.Length) + if (needLoop) { - // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") && - // ...) - Debug.Assert(needLoop); - int start = charClassIndex; - for (; charClassIndex < lcc.Length; charClassIndex++) + Debug.Assert(setIndex == 0 || setIndex == 1); + bool hasCharClassConditions = false; + if (setIndex < setsToUse) { - string spanIndex = charClassIndex > 0 ? $"span[i + {charClassIndex}]" : "span[i]"; - string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, lcc[charClassIndex].CharClass, lcc[charClassIndex].CaseInsensitive); - - if (charClassIndex == start) + // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") && + // ...) + Debug.Assert(needLoop); + int start = setIndex; + for (; setIndex < setsToUse; setIndex++) { - writer.Write($"if ({charInClassExpr}"); - } - else - { - writer.WriteLine(" &&"); - writer.Write($" {charInClassExpr}"); + string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]"; + string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive); + + if (setIndex == start) + { + writer.Write($"if ({charInClassExpr}"); + } + else + { + writer.WriteLine(" &&"); + writer.Write($" {charInClassExpr}"); + } } + writer.WriteLine(")"); + hasCharClassConditions = true; } - writer.WriteLine(")"); - hasCharClassConditions = true; - } - using (hasCharClassConditions ? EmitBlock(writer, null) : default) - { - writer.WriteLine("base.runtextpos = runtextpos + i;"); - writer.WriteLine("return true;"); + using (hasCharClassConditions ? EmitBlock(writer, null) : default) + { + writer.WriteLine("base.runtextpos = runtextpos + i;"); + writer.WriteLine("return true;"); + } } loopBlock.Dispose(); } + + // If a TextInfo is needed to perform ToLower operations, emits a local initialized to the TextInfo to use. 
+ static void EmitTextInfo(IndentedTextWriter writer, ref bool hasTextInfo, RegexMethod rm) + { + // Emit local to store current culture if needed + if ((rm.Options & RegexOptions.CultureInvariant) == 0) + { + bool needsCulture = rm.Code.FindOptimizations.FindMode switch + { + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or + FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + + _ when rm.Code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + + _ => false, + }; + + if (needsCulture) + { + hasTextInfo = true; + writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); + } + } + } } /// Emits the body of the Go override. @@ -750,9 +682,12 @@ private static void EmitNonBacktrackingGo(IndentedTextWriter writer, RegexMethod /// Emits the body of a simplified Go implementation that's possible when there's minimal backtracking required by the expression. private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, string id) { + // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated + // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. + const int MaxUnrollSize = 16; + RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses; bool rtl = code.RightToLeft; bool hasTimeout = false; @@ -1267,26 +1202,7 @@ void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthCheck break; case RegexNode.Concatenate: - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) - { - EmitSpanLengthCheck(requiredLength); - writer.WriteLine(); - - for (; i < exclusiveEnd; i++) - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); - } - - i--; - } - else - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); - } - } + EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Capture: @@ -1360,8 +1276,91 @@ void EmitUpdateBumpalong() writer.WriteLine("base.runtextpos = runtextpos;"); } + void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) + { + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + { + bool wroteClauses = true; + writer.Write($"if ({SpanLengthCheck(requiredLength)}"); + + while (i < exclusiveEnd) + { + for (; i < exclusiveEnd; i++) + { + void WriteSingleCharChild(RegexNode child) + { + if (wroteClauses) + { + writer.WriteLine(" ||"); + writer.Write(" "); + } + else + { + writer.Write("if ("); + } + EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); + wroteClauses = true; + } + + RegexNode child = node.Child(i); + if (child.Type is RegexNode.One or RegexNode.Notone or RegexNode.Set) + { + WriteSingleCharChild(child); + writer.Write($" /* {DescribeNode(child)} */"); + } + else if (child.Type is RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or + RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic or + RegexNode.Notoneloop or 
RegexNode.Notonelazy or RegexNode.Notoneloopatomic && + child.M == child.N && + child.M <= MaxUnrollSize) + { + for (int c = 0; c < child.M; c++) + { + WriteSingleCharChild(child); + if (c == 0) + { + writer.Write($" /* {DescribeNode(child)} */"); + } + } + } + else + { + break; + } + } + + if (wroteClauses) + { + writer.WriteLine(")"); + using (EmitBlock(writer, null)) + { + writer.WriteLine($"goto {doneLabel};"); + } + wroteClauses = false; + } + + if (i < exclusiveEnd) + { + writer.WriteLine(); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); + i++; + } + } + + i--; + } + else + { + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); + } + } + } + // Emits the code to handle a single-character match. - void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null) + void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null, bool clauseOnly = false) { // This only emits a single check, but it's called from the looping constructs in a loop // to generate the code for a single check, so we map those looping constructs to the @@ -1375,13 +1374,20 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? "!=" : "==")} {Literal(node.Ch)}"; } - using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) + if (clauseOnly) { - writer.WriteLine($"goto {doneLabel};"); + writer.Write(expr); + } + else + { + using (EmitBlock(writer, emitLengthCheck ? 
$"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) + { + writer.WriteLine($"goto {doneLabel};"); + } } textSpanPos++; @@ -1685,10 +1691,6 @@ void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) EmitSpanLengthCheck(iterations); } - // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated - // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. - const int MaxUnrollSize = 16; - if (iterations <= MaxUnrollSize) { // if (textSpan[textSpanPos] != c1 || @@ -1771,13 +1773,13 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = int minIterations = node.M; int maxIterations = node.N; - Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny + Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today int numSetChars = 0; string iterationLocal = NextLocalName("i"); if (node.IsNotoneFamily && maxIterations == int.MaxValue && - (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) + (!IsCaseInsensitive(node))) { // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. 
If we're unbounded, such as with ".*" and if we're case-sensitive, @@ -1802,21 +1804,25 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 && RegexCharClass.IsNegated(node.Str!)) { - // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would + // If the set is negated and contains only a few characters (if it contained 1 and was negated, it should // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. + Debug.Assert(numSetChars > 1); writer.Write($"int {iterationLocal} = global::System.MemoryExtensions.IndexOfAny({textSpanLocal}"); if (textSpanPos != 0) { writer.Write($".Slice({textSpanPos})"); } - writer.WriteLine(numSetChars == 2 ? - $", {Literal(setChars[0])}, {Literal(setChars[1])});" : - $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});"); + writer.WriteLine(numSetChars switch + { + 2 => $", {Literal(setChars[0])}, {Literal(setChars[1])});", + 3 => $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", + _ => $", {Literal(setChars.Slice(0, numSetChars).ToString())});", + }); using (EmitBlock(writer, $"if ({iterationLocal} == -1)")) { writer.WriteLine(textSpanPos > 0 ? @@ -1844,7 +1850,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? 
"==" : "!=")} {Literal(node.Ch)}"; } @@ -1895,7 +1901,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } @@ -2538,7 +2544,7 @@ void EmitOneCode(string? label) clause += Code() == RegexCode.Set ? $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive())}" : - $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}"; + $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive())} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}"; using (EmitBlock(writer, $"if ({clause})")) { @@ -2555,7 +2561,7 @@ void EmitOneCode(string? label) writer.WriteLine($"if (runtextend - runtextpos < {str.Length} ||"); for (int i = 0; i < str.Length; i++) { - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}"); + writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive())} != {Literal(str[i])}"); writer.WriteLine(i < str.Length - 1 ? " ||" : ")"); } using (EmitBlock(writer, null)) @@ -2575,7 +2581,7 @@ void EmitOneCode(string? 
label) for (int i = str.Length; i > 0;) { i--; - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}"); + writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive())} != {Literal(str[i])}"); writer.WriteLine(i == 0 ? ")" : " ||"); } using (EmitBlock(writer, null)) @@ -2661,7 +2667,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {(Code() == RegexCode.Onerep ? "!=" : "==")} {Literal((char)Operand(0))}"; } @@ -2708,7 +2714,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); string op = Code() == RegexCode.Onerep ? "!=" : "=="; using (EmitBlock(writer, $"if ({expr} {op} {Literal((char)Operand(0))})")) { @@ -2769,14 +2775,14 @@ void EmitOneCode(string? label) } string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? rm.Code.Strings[Operand(0)] : null; - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars; // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, // we can use the vectorized IndexOf to search for the target character. 
if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && !IsRightToLeft() && - (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0)))) + !IsCaseInsensitive()) { writer.WriteLine($"{I} = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal((char)Operand(0))}); // i"); using (EmitBlock(writer, $"if ({I} == -1)")) @@ -2793,20 +2799,19 @@ void EmitOneCode(string? label) else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && !IsRightToLeft() && !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && RegexCharClass.IsNegated(set!)) { // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny + // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny // to search for those chars. 
- - Debug.Assert(numSetChars is 2 or 3); - writer.Write($"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}"); - if (numSetChars == 3) + Debug.Assert(numSetChars > 1); + writer.WriteLine(numSetChars switch { - writer.Write($", {Literal(setChars[2])}"); - } - writer.WriteLine("); // i"); + 2 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}); // i", + 3 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])}); // i", + _ => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars.Slice(0, numSetChars).ToString())}); // i", + }); using (EmitBlock(writer, $"if ({I} == -1)")) { writer.WriteLine($"runtextpos += {Len};"); @@ -2846,7 +2851,7 @@ void EmitOneCode(string? label) else { string op = Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic ? "!=" : "=="; - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {op} {Literal((char)Operand(0))}"; } @@ -2975,7 +2980,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {(Code() == RegexCode.Onelazy ? 
"!=" : "==")} {Literal((char)Operand(0))}"; } @@ -3223,40 +3228,6 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression; - private static void EmitTextInfoIfRequired(IndentedTextWriter writer, ref bool textInfoEmitted, ref bool hasTextInfo, RegexMethod rm) - { - if (textInfoEmitted) - { - return; - } - textInfoEmitted = true; - - // Emit local to store current culture if needed - if ((((RegexOptions)rm.Options) & RegexOptions.CultureInvariant) == 0) - { - bool needsCulture = (((RegexOptions)rm.Options) & RegexOptions.IgnoreCase) != 0 || rm.Code.BoyerMoorePrefix?.CaseInsensitive == true; - if (!needsCulture && rm.Code.LeadingCharClasses is not null) - { - for (int i = 0; i < rm.Code.LeadingCharClasses.Length; i++) - { - if (rm.Code.LeadingCharClasses[i].CaseInsensitive) - { - needsCulture = true; - break; - } - } - } - - if (needsCulture) - { - hasTextInfo = true; - writer.WriteLine("// IgnoreCase with CultureInfo.CurrentCulture"); - writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); - writer.WriteLine(); - } - } - } - private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive) { // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), @@ -3319,22 +3290,32 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})"; } - // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), - // it's cheaper and smaller to compare against each than it is to use a lookup table. 
- if (!invariant) + // Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes), + // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case + // the very common case with case insensitivity of two characters next to each other being the upper and lowercase + // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison. + if (!invariant && !RegexCharClass.IsNegated(charClass)) { - Span setChars = stackalloc char[3]; - int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[4]; + switch (RegexCharClass.GetSetChars(charClass, setChars)) { - switch (numChars) - { - case 2: - return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; + case 2: + return (setChars[0] | 0x20) == setChars[1] ? + $"(({chExpr} | 0x20) == {Literal(setChars[1])})" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; - case 3: - return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))"; - } + case 3: + return (setChars[0] | 0x20) == setChars[1] ? 
+ $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))"; + + case 4: + if (((setChars[0] | 0x20) == setChars[1]) && + ((setChars[2] | 0x20) == setChars[3])) + { + return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))"; + } + break; } } diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs index 045dbd5467968..cb3bed4d27fa2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs @@ -178,7 +178,7 @@ private static bool IsSyntaxTargetForGeneration(SyntaxNode node) => RegexCode code; try { - code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture)); + code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture), culture); } catch (Exception e) { diff --git a/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs b/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs index aefab1dc1b057..13626a4be5a3b 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs @@ -54,13 +54,13 @@ namespace System.Threading { internal static class InterlockedExtensions { - public static int Or(ref int location1, int value) + public static uint Or(ref uint location1, uint value) { - int current = location1; + uint current = location1; while (true) { - int newValue = current | value; - int oldValue = Interlocked.CompareExchange(ref location1, newValue, current); + uint newValue = current | value; + uint oldValue = (uint)Interlocked.CompareExchange(ref Unsafe.As(ref location1), (int)newValue, (int)current); if (oldValue == current) { return oldValue; diff --git 
a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index 7f59e37493cd8..8e1ec70d99d6c 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -30,10 +30,10 @@ - + diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index da0f0edd7c0fa..8537fd70de527 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -23,11 +23,11 @@ - + @@ -100,6 +100,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs index edcb23b1cdba9..66b1a8108c4ab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs @@ -128,7 +128,7 @@ public static Regex GetOrAdd(string pattern, RegexOptions options, TimeSpan matc Regex.ValidateOptions(options); Regex.ValidateMatchTimeout(matchTimeout); - CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + CultureInfo culture = RegexParser.GetTargetCulture(options); Key key = new Key(pattern, culture.ToString(), options, matchTimeout); Regex? 
regex = Get(key); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 85d575ccaf193..ee276b33deb75 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -65,12 +65,12 @@ internal Regex(string pattern, CultureInfo? culture) // Call Init directly rather than delegating to a Regex ctor that takes // options to enable linking / tree shaking to remove the Regex compiler // and NonBacktracking implementation if it's not used. - Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture); + Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture ?? CultureInfo.CurrentCulture); } internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture) { - culture ??= GetTargetCulture(options); + culture ??= RegexParser.GetTargetCulture(options); Init(pattern, options, matchTimeout, culture); if ((options & RegexOptions.NonBacktracking) != 0) @@ -87,10 +87,6 @@ internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, Cult } } - /// Gets the culture to use based on the specified options. - private static CultureInfo GetTargetCulture(RegexOptions options) => - (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - /// Initializes the instance. /// /// This is separated out of the constructor so that an app only using 'new Regex(pattern)' @@ -98,7 +94,7 @@ private static CultureInfo GetTargetCulture(RegexOptions options) => /// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used. /// [MemberNotNull(nameof(_code))] - private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? 
culture) + private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) { ValidatePattern(pattern); ValidateOptions(options); @@ -107,7 +103,6 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C this.pattern = pattern; internalMatchTimeout = matchTimeout; roptions = options; - culture ??= GetTargetCulture(options); #if DEBUG if (IsDebug) @@ -121,7 +116,7 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C // Generate the RegexCode from the node tree. This is required for interpreting, // and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking. - _code = RegexWriter.Write(tree); + _code = RegexWriter.Write(tree, culture); if ((options & RegexOptions.NonBacktracking) != 0) { @@ -434,7 +429,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall /// Creates a new runner instance. private RegexRunner CreateRunner() => factory?.CreateInstance() ?? - new RegexInterpreter(_code!, GetTargetCulture(roptions)); + new RegexInterpreter(_code!, RegexParser.GetTargetCulture(roptions)); /// True if the option was set. protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs deleted file mode 100644 index 7fc3fb1edf3ba..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs +++ /dev/null @@ -1,404 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -// The RegexBoyerMoore object precomputes the Boyer-Moore -// tables for fast string scanning. 
These tables allow -// you to scan for the first occurrence of a string within -// a large body of text without examining every character. -// The performance of the heuristic depends on the actual -// string and the text being searched, but usually, the longer -// the string that is being searched for, the fewer characters -// need to be examined. - -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.Globalization; - -namespace System.Text.RegularExpressions -{ - internal sealed class RegexBoyerMoore - { - public readonly int[] Positive; - public readonly int[] NegativeASCII; - public readonly int[][]? NegativeUnicode; - public readonly string Pattern; - public readonly int LowASCII; - public readonly int HighASCII; - public readonly bool RightToLeft; - public readonly bool CaseInsensitive; - private readonly CultureInfo _culture; - - /// The maximum prefix string length for which we'll attempt to create a Boyer-Moore table. - /// This is limited in order to minimize the overhead of constructing a Regex. - public const int MaxLimit = 50_000; // must be <= char.MaxValue for RegexCompiler to compile Boyer-Moore correctly - - /// - /// Constructs a Boyer-Moore state machine for searching for the string - /// pattern. The string must not be zero-length. - /// - public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) - { - // Sorry, you just can't use Boyer-Moore to find an empty pattern. - // We're doing this for your own protection. (Really, for speed.) - Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); - Debug.Assert(pattern.Length <= MaxLimit, "RegexBoyerMoore can take a long time for large patterns"); -#if DEBUG - if (caseInsensitive) - { - foreach (char c in pattern) - { - // We expect each individual character to have been lower-cased. 
We don't validate the whole - // string at once because the rest of the library doesn't currently recognize/support surrogate pairs. - Debug.Assert(c == culture.TextInfo.ToLower(c), "Pattern wasn't lowercased with provided culture"); - } - } -#endif - - Pattern = pattern; - RightToLeft = rightToLeft; - CaseInsensitive = caseInsensitive; - _culture = culture; - - int beforefirst; - int last; - int bump; - - if (!rightToLeft) - { - beforefirst = -1; - last = pattern.Length - 1; - bump = 1; - } - else - { - beforefirst = pattern.Length; - last = 0; - bump = -1; - } - - // PART I - the good-suffix shift table - // - // compute the positive requirement: - // if char "i" is the first one from the right that doesn't match, - // then we know the matcher can advance by _positive[i]. - // - // This algorithm is a simplified variant of the standard - // Boyer-Moore good suffix calculation. - - Positive = new int[pattern.Length]; - - int examine = last; - char ch = pattern[examine]; - Positive[examine] = bump; - examine -= bump; - int scan; - int match; - - while (true) - { - // find an internal char (examine) that matches the tail - - while (true) - { - if (examine == beforefirst) - goto OuterloopBreak; - if (pattern[examine] == ch) - break; - examine -= bump; - } - - match = last; - scan = examine; - - // find the length of the match - - while (true) - { - if (scan == beforefirst || pattern[match] != pattern[scan]) - { - // at the end of the match, note the difference in _positive - // this is not the length of the match, but the distance from the internal match - // to the tail suffix. 
- if (Positive[match] == 0) - Positive[match] = match - scan; - - break; - } - - scan -= bump; - match -= bump; - } - - examine -= bump; - } - - OuterloopBreak: - - match = last - bump; - - // scan for the chars for which there are no shifts that yield a different candidate - - - // The inside of the if statement used to say - // "_positive[match] = last - beforefirst;" - // This is slightly less aggressive in how much we skip, but at worst it - // should mean a little more work rather than skipping a potential match. - while (match != beforefirst) - { - if (Positive[match] == 0) - Positive[match] = bump; - - match -= bump; - } - - // PART II - the bad-character shift table - // - // compute the negative requirement: - // if char "ch" is the reject character when testing position "i", - // we can slide up by _negative[ch]; - // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) - // - // the lookup table is divided into ASCII and Unicode portions; - // only those parts of the Unicode 16-bit code set that actually - // appear in the string are in the table. (Maximum size with - // Unicode is 65K; ASCII only case is 512 bytes.) 
- - NegativeASCII = new int[128]; - - for (int i = 0; i < 128; i++) - NegativeASCII[i] = last - beforefirst; - - LowASCII = 127; - HighASCII = 0; - - for (examine = last; examine != beforefirst; examine -= bump) - { - ch = pattern[examine]; - - if (ch < 128) - { - if (LowASCII > ch) - LowASCII = ch; - - if (HighASCII < ch) - HighASCII = ch; - - if (NegativeASCII[ch] == last - beforefirst) - NegativeASCII[ch] = last - examine; - } - else - { - int i = ch >> 8; - int j = ch & 0xFF; - - if (NegativeUnicode == null) - { - NegativeUnicode = new int[256][]; - } - - if (NegativeUnicode[i] == null) - { - int[] newarray = new int[256]; - - for (int k = 0; k < newarray.Length; k++) - newarray[k] = last - beforefirst; - - if (i == 0) - { - Array.Copy(NegativeASCII, newarray, 128); - NegativeASCII = newarray; - } - - NegativeUnicode[i] = newarray; - } - - if (NegativeUnicode[i][j] == last - beforefirst) - NegativeUnicode[i][j] = last - examine; - } - } - } - - // TODO: We should be able to avoid producing the RegexBoyerMoore instance - // entirely if we're going to go down the code path of using IndexOf. That will - // require some refactoring, though. - - /// Gets whether IndexOf could be used to perform the match. 
- public bool PatternSupportsIndexOf => - !RightToLeft && (!CaseInsensitive || !RegexCharClass.ParticipatesInCaseConversion(Pattern)); - - /// - /// When a regex is anchored, we can do a quick IsMatch test instead of a Scan - /// - public bool IsMatch(string text, int index, int beglimit, int endlimit) - { - if (!RightToLeft) - { - if (index < beglimit || endlimit - index < Pattern.Length) - return false; - } - else - { - if (index > endlimit || index - beglimit < Pattern.Length) - return false; - - index -= Pattern.Length; - } - - if (CaseInsensitive) - { - TextInfo textinfo = _culture.TextInfo; - - for (int i = 0; i < Pattern.Length; i++) - { - if (Pattern[i] != textinfo.ToLower(text[index + i])) - { - return false; - } - } - - return true; - } - - return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length)); - } - - /// - /// Scan uses the Boyer-Moore algorithm to find the first occurrence - /// of the specified string within text, beginning at index, and - /// constrained within beglimit and endlimit. - /// - /// The direction and case-sensitivity of the match is determined - /// by the arguments to the RegexBoyerMoore constructor. 
- /// - public int Scan(string text, int index, int beglimit, int endlimit) - { - int defadv; - int test; - int startmatch; - int endmatch; - int bump; - - if (!RightToLeft) - { - defadv = Pattern.Length; - startmatch = Pattern.Length - 1; - endmatch = 0; - test = index + defadv - 1; - bump = 1; - } - else - { - defadv = -Pattern.Length; - startmatch = 0; - endmatch = -defadv - 1; - test = index + defadv; - bump = -1; - } - - char chMatch = Pattern[startmatch]; - char chTest; - int test2; - int match; - int advance; - int[] unicodeLookup; - - while (true) - { - if (test >= endlimit || test < beglimit) - return -1; - - chTest = text[test]; - - if (CaseInsensitive) - chTest = _culture.TextInfo.ToLower(chTest); - - if (chTest != chMatch) - { - if (chTest < 128) - advance = NegativeASCII[chTest]; - else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8]))) - advance = unicodeLookup[chTest & 0xFF]; - else - advance = defadv; - - test += advance; - } - else - { // if (chTest == chMatch) - test2 = test; - match = startmatch; - - while (true) - { - if (match == endmatch) - return (RightToLeft ? test2 + 1 : test2); - - match -= bump; - test2 -= bump; - - chTest = text[test2]; - - if (CaseInsensitive) - chTest = _culture.TextInfo.ToLower(chTest); - - if (chTest != Pattern[match]) - { - advance = Positive[match]; - if ((chTest & 0xFF80) == 0) - test2 = (match - startmatch) + NegativeASCII[chTest]; - else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8]))) - test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; - else - { - test += advance; - break; - } - - if (RightToLeft ? test2 < advance : test2 > advance) - advance = test2; - - test += advance; - break; - } - } - } - } - } - -#if DEBUG - /// Used when dumping for debugging. 
- [ExcludeFromCodeCoverage] - public override string ToString() => Dump(string.Empty); - - [ExcludeFromCodeCoverage] - public string Dump(string indent) - { - var sb = new StringBuilder(); - - sb.AppendLine($"{indent}BM Pattern: {Pattern}"); - - sb.Append($"{indent}Positive: "); - foreach (int i in Positive) - { - sb.Append($"{i} "); - } - sb.AppendLine(); - - if (NegativeASCII != null) - { - sb.Append($"{indent}Negative table: "); - for (int i = 0; i < NegativeASCII.Length; i++) - { - if (NegativeASCII[i] != Pattern.Length) - { - sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}"); - } - } - } - sb.AppendLine(); - - return sb.ToString(); - } -#endif - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 1c2cf0ff65817..0b6dafbebb3d2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; +using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions @@ -71,6 +72,7 @@ internal sealed partial class RegexCharClass internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; internal const string AnyClass = "\x00\x01\x00\x00"; + private const string EmptyClass = "\x00\x00\x00"; // UnicodeCategory is zero based, so we add one to each value and subtract it off later private const int DefinedCategoriesCapacity = 38; @@ -874,9 +876,9 @@ public static bool ParticipatesInCaseConversion(int comparison) } } - /// Gets whether the specified string participates in case conversion. - /// The string participates in case conversion if any of its characters do. 
- public static bool ParticipatesInCaseConversion(string s) + /// Gets whether the specified span participates in case conversion. + /// The span participates in case conversion if any of its characters do. + public static bool ParticipatesInCaseConversion(ReadOnlySpan s) { foreach (char c in s) { @@ -890,6 +892,7 @@ public static bool ParticipatesInCaseConversion(string s) } /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. + /// This may enumerate negated characters if the set is negated. private static bool CanEasilyEnumerateSetContents(string set) => set.Length > SetStartIndex && set[SetLengthIndex] > 0 && @@ -1013,61 +1016,69 @@ public static bool IsWordChar(char ch) } } - public static bool CharInClass(char ch, string set, ref int[]? asciiResultCache) + /// Determines a character's membership in a character class (via the string representation of the class). + /// The character. + /// The string representation of the character class. + /// A lazily-populated cache for ASCII results stored in a 256-bit array. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache) { - // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair - // says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never + // The uint[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit in the pair + // says whether the second bit in the pair has already been computed. Once a value is computed, it's never // changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit // has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result. 
const int CacheArrayLength = 8; - Debug.Assert(asciiResultCache is null || asciiResultCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters"); + Debug.Assert(asciiLazyCache is null || asciiLazyCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters"); - if (ch < 128) + // If the value is ASCII and already has an answer for this value, use it. + if (asciiLazyCache is uint[] cache) { - // Lazily-initialize the cache for this set. - if (asciiResultCache is null) + int index = ch >> 4; + if ((uint)index < (uint)cache.Length) { - Interlocked.CompareExchange(ref asciiResultCache, new int[CacheArrayLength], null); + Debug.Assert(ch < 128); + uint current = cache[index]; + uint bit = 1u << ((ch & 0xF) << 1); + if ((current & bit) != 0) + { + return (current & (bit << 1)) != 0; + } } + } - // Determine which int in the lookup array contains the known and value bits for this character, - // and compute their bit numbers. - ref int slot = ref asciiResultCache[ch >> 4]; - int knownBit = 1 << ((ch & 0xF) << 1); - int valueBit = knownBit << 1; - - // If the value for this bit has already been computed, use it. - int current = slot; - if ((current & knownBit) != 0) - { - return (current & valueBit) != 0; - } + // For ASCII, lazily initialize. For non-ASCII, just compute the value. + return ch < 128 ? + InitializeValue(ch, set, ref asciiLazyCache) : + CharInClassRecursive(ch, set, 0); + static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache) + { // (After warm-up, we should find ourselves rarely getting here.) + Debug.Assert(ch < 128); - // Otherwise, compute it normally. + // Compute the result and determine which bits to write back to the array and "or" the bits back in a thread-safe manner. bool isInClass = CharInClass(ch, set); - - // Determine which bits to write back to the array and "or" the bits back in a thread-safe manner. 
- int bitsToSet = knownBit; + uint bitsToSet = 1u << ((ch & 0xF) << 1); if (isInClass) { - bitsToSet |= valueBit; + bitsToSet |= bitsToSet << 1; } + + uint[]? cache = asciiLazyCache ?? Interlocked.CompareExchange(ref asciiLazyCache, new uint[CacheArrayLength], null) ?? asciiLazyCache; #if REGEXGENERATOR - InterlockedExtensions.Or(ref slot, bitsToSet); + InterlockedExtensions.Or(ref cache[ch >> 4], bitsToSet); #else - Interlocked.Or(ref slot, bitsToSet); + Interlocked.Or(ref cache[ch >> 4], bitsToSet); #endif // Return the computed value. return isInClass; } - - // Non-ASCII. Fall back to computing the answer. - return CharInClassRecursive(ch, set, 0); } + /// + /// Determines a character's membership in a character class (via the string representation of the class). + /// public static bool CharInClass(char ch, string set) => CharInClassRecursive(ch, set, 0); @@ -1279,6 +1290,283 @@ private static RegexCharClass ParseRecursive(string charClass, int start) return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub); } + #region Perf workaround until https://github.com/dotnet/runtime/issues/61048 and https://github.com/dotnet/runtime/issues/59492 are addressed + // TODO: https://github.com/dotnet/runtime/issues/61048 + // The below functionality needs to be removed/replaced/generalized. The goal is to avoid relying on + // ToLower and culture-based operation at match time, and instead be able to compute at construction + // time case folding equivalence classes that let us determine up-front the set of characters considered + // valid for a match. For now, we do this just for ASCII, and for anything else fall back to the + // pre-existing mechanism whereby a culture is used at construction time to ToLower and then one is + // used at match time to ToLower. We also skip 'i' and 'I', as the casing of those varies across culture + // whereas every other ASCII value's casing is stable across culture. 
We could hardcode the values for + // when an invariant vs tr/az culture vs any other culture is used, and we likely will, but for now doing + // so would be a breaking change, as in doing so we'd be relying only on the culture present at the time + // of construction rather than the one at the time of match. That will be resolved with + // https://github.com/dotnet/runtime/issues/59492. + + /// Creates a set string for a single character, optionally factoring in case-insensitivity. + /// The character for which to create the set. + /// null if case-sensitive; non-null if case-insensitive, in which case it's the culture to use. + /// false if the caller should strip out RegexOptions.IgnoreCase because it's now fully represented by the set; otherwise, true. + /// The created set string. + public static string OneToStringClass(char c, CultureInfo? caseInsensitive, out bool resultIsCaseInsensitive) + { + var vsb = new ValueStringBuilder(stackalloc char[4]); + + if (caseInsensitive is null) + { + resultIsCaseInsensitive = false; + vsb.Append(c); + } + else if (c < 128 && (c | 0x20) != 'i') + { + resultIsCaseInsensitive = false; + switch (c) + { + // These are the same in all cultures. As with the rest of this support, we can generalize this + // once we fix the aforementioned casing issues, e.g. by lazily populating an interning cache + // rather than hardcoding the strings for these values, once almost all values will be the same + // regardless of culture.
+ case 'A': case 'a': return "\0\x0004\0ABab"; + case 'B': case 'b': return "\0\x0004\0BCbc"; + case 'C': case 'c': return "\0\x0004\0CDcd"; + case 'D': case 'd': return "\0\x0004\0DEde"; + case 'E': case 'e': return "\0\x0004\0EFef"; + case 'F': case 'f': return "\0\x0004\0FGfg"; + case 'G': case 'g': return "\0\x0004\0GHgh"; + case 'H': case 'h': return "\0\x0004\0HIhi"; + // allow 'i' to fall through + case 'J': case 'j': return "\0\x0004\0JKjk"; + case 'K': case 'k': return "\0\x0006\0KLkl\u212A\u212B"; + case 'L': case 'l': return "\0\x0004\0LMlm"; + case 'M': case 'm': return "\0\x0004\0MNmn"; + case 'N': case 'n': return "\0\x0004\0NOno"; + case 'O': case 'o': return "\0\x0004\0OPop"; + case 'P': case 'p': return "\0\x0004\0PQpq"; + case 'Q': case 'q': return "\0\x0004\0QRqr"; + case 'R': case 'r': return "\0\x0004\0RSrs"; + case 'S': case 's': return "\0\x0004\0STst"; + case 'T': case 't': return "\0\x0004\0TUtu"; + case 'U': case 'u': return "\0\x0004\0UVuv"; + case 'V': case 'v': return "\0\x0004\0VWvw"; + case 'W': case 'w': return "\0\x0004\0WXwx"; + case 'X': case 'x': return "\0\x0004\0XYxy"; + case 'Y': case 'y': return "\0\x0004\0YZyz"; + case 'Z': case 'z': return "\0\x0004\0Z[z{"; + + // All the ASCII !ParticipatesInCaseConversion + case '\u0000': return "\0\u0002\0\u0000\u0001"; + case '\u0001': return "\0\u0002\0\u0001\u0002"; + case '\u0002': return "\0\u0002\0\u0002\u0003"; + case '\u0003': return "\0\u0002\0\u0003\u0004"; + case '\u0004': return "\0\u0002\0\u0004\u0005"; + case '\u0005': return "\0\u0002\0\u0005\u0006"; + case '\u0006': return "\0\u0002\0\u0006\u0007"; + case '\u0007': return "\0\u0002\0\u0007\u0008"; + case '\u0008': return "\0\u0002\0\u0008\u0009"; + case '\u0009': return "\0\u0002\0\u0009\u000A"; + case '\u000A': return "\0\u0002\0\u000A\u000B"; + case '\u000B': return "\0\u0002\0\u000B\u000C"; + case '\u000C': return "\0\u0002\0\u000C\u000D"; + case '\u000D': return "\0\u0002\0\u000D\u000E"; + case '\u000E': return 
"\0\u0002\0\u000E\u000F"; + case '\u000F': return "\0\u0002\0\u000F\u0010"; + case '\u0010': return "\0\u0002\0\u0010\u0011"; + case '\u0011': return "\0\u0002\0\u0011\u0012"; + case '\u0012': return "\0\u0002\0\u0012\u0013"; + case '\u0013': return "\0\u0002\0\u0013\u0014"; + case '\u0014': return "\0\u0002\0\u0014\u0015"; + case '\u0015': return "\0\u0002\0\u0015\u0016"; + case '\u0016': return "\0\u0002\0\u0016\u0017"; + case '\u0017': return "\0\u0002\0\u0017\u0018"; + case '\u0018': return "\0\u0002\0\u0018\u0019"; + case '\u0019': return "\0\u0002\0\u0019\u001A"; + case '\u001A': return "\0\u0002\0\u001A\u001B"; + case '\u001B': return "\0\u0002\0\u001B\u001C"; + case '\u001C': return "\0\u0002\0\u001C\u001D"; + case '\u001D': return "\0\u0002\0\u001D\u001E"; + case '\u001E': return "\0\u0002\0\u001E\u001F"; + case '\u001F': return "\0\u0002\0\u001F\u0020"; + case '\u0020': return "\0\u0002\0\u0020\u0021"; + case '\u0021': return "\0\u0002\0\u0021\u0022"; + case '\u0022': return "\0\u0002\0\u0022\u0023"; + case '\u0023': return "\0\u0002\0\u0023\u0024"; + case '\u0025': return "\0\u0002\0\u0025\u0026"; + case '\u0026': return "\0\u0002\0\u0026\u0027"; + case '\u0027': return "\0\u0002\0\u0027\u0028"; + case '\u0028': return "\0\u0002\0\u0028\u0029"; + case '\u0029': return "\0\u0002\0\u0029\u002A"; + case '\u002A': return "\0\u0002\0\u002A\u002B"; + case '\u002C': return "\0\u0002\0\u002C\u002D"; + case '\u002D': return "\0\u0002\0\u002D\u002E"; + case '\u002E': return "\0\u0002\0\u002E\u002F"; + case '\u002F': return "\0\u0002\0\u002F\u0030"; + case '\u0030': return "\0\u0002\0\u0030\u0031"; + case '\u0031': return "\0\u0002\0\u0031\u0032"; + case '\u0032': return "\0\u0002\0\u0032\u0033"; + case '\u0033': return "\0\u0002\0\u0033\u0034"; + case '\u0034': return "\0\u0002\0\u0034\u0035"; + case '\u0035': return "\0\u0002\0\u0035\u0036"; + case '\u0036': return "\0\u0002\0\u0036\u0037"; + case '\u0037': return "\0\u0002\0\u0037\u0038"; + case '\u0038': return 
"\0\u0002\0\u0038\u0039"; + case '\u0039': return "\0\u0002\0\u0039\u003A"; + case '\u003A': return "\0\u0002\0\u003A\u003B"; + case '\u003B': return "\0\u0002\0\u003B\u003C"; + case '\u003F': return "\0\u0002\0\u003F\u0040"; + case '\u0040': return "\0\u0002\0\u0040\u0041"; + case '\u005B': return "\0\u0002\0\u005B\u005C"; + case '\u005C': return "\0\u0002\0\u005C\u005D"; + case '\u005D': return "\0\u0002\0\u005D\u005E"; + case '\u005F': return "\0\u0002\0\u005F\u0060"; + case '\u007B': return "\0\u0002\0\u007B\u007C"; + case '\u007D': return "\0\u0002\0\u007D\u007E"; + case '\u007F': return "\0\u0002\0\u007F\u0080"; + } + AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, caseInsensitive); + } + else if (!ParticipatesInCaseConversion(c)) + { + resultIsCaseInsensitive = false; + vsb.Append(c); + } + else + { + resultIsCaseInsensitive = true; + vsb.Append(char.ToLower(c, caseInsensitive)); + } + + string result = CharsToStringClass(vsb.AsSpan()); + vsb.Dispose(); + return result; + } + + private static unsafe string CharsToStringClass(ReadOnlySpan chars) + { +#if DEBUG + // Make sure they're all sorted with no duplicates + for (int index = 0; index < chars.Length - 1; index++) + { + Debug.Assert(chars[index] < chars[index + 1]); + } +#endif + + // If there aren't any chars, just return an empty class. + if (chars.Length == 0) + { + return EmptyClass; + } + + // Count how many characters there actually are. All but the very last possible + // char value will have two characters, one for the inclusive beginning of range + // and one for the exclusive end of range. + int count = chars.Length * 2; + if (chars[chars.Length - 1] == LastChar) + { + count--; + } + + // Get the pointer/length of the span to be able to pass it into string.Create. 
+ fixed (char* charsPtr = chars) + { +#if REGEXGENERATOR + return StringExtensions.Create( +#else + return string.Create( +#endif + SetStartIndex + count, ((IntPtr)charsPtr, chars.Length), static (span, state) => + { + // Reconstruct the span now that we're inside of the lambda. + ReadOnlySpan chars = new ReadOnlySpan((char*)state.Item1, state.Length); + + // Fill in the set string + span[FlagsIndex] = (char)0; + span[CategoryLengthIndex] = (char)0; + span[SetLengthIndex] = (char)(span.Length - SetStartIndex); + int i = SetStartIndex; + foreach (char c in chars) + { + span[i++] = c; + if (c != LastChar) + { + span[i++] = (char)(c + 1); + } + } + Debug.Assert(i == span.Length); + }); + } + } + + /// Tries to create from a RegexOptions.IgnoreCase set string a new set string that can be used without RegexOptions.IgnoreCase. + /// The original set string from a RegexOptions.IgnoreCase node. + /// The culture in use. + /// A new set string if one could be created. + public static string? MakeCaseSensitiveIfPossible(string set, CultureInfo culture) + { + if (IsNegated(set)) + { + return null; + } + + // We'll eventually need a more robust way to do this for any set. For now, we iterate through each character + // in the set, and to avoid spending lots of time doing so, we limit the number of characters. This approach also + // limits the structure of the sets allowed, e.g. they can't be negated, can't use subtraction, etc. + Span setChars = stackalloc char[64]; // arbitrary limit chosen to include common groupings like all ASCII letters and digits + + // Try to get the set's characters. + int setCharsCount = GetSetChars(set, setChars); + if (setCharsCount == 0) + { + return null; + } + + // Enumerate all the characters and add all characters that form their case folding equivalence class.
+ var rcc = new RegexCharClass(); + var vsb = new ValueStringBuilder(stackalloc char[4]); + foreach (char c in setChars.Slice(0, setCharsCount)) + { + if (c >= 128 || c == 'i' || c == 'I') + { + return null; + } + + vsb.Length = 0; + AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, culture); + foreach (char v in vsb.AsSpan()) + { + rcc.AddChar(v); + } + } + + // Return the constructed class. + return rcc.ToStringClass(); + } + + private static void AddAsciiCharIgnoreCaseEquivalence(char c, ref ValueStringBuilder vsb, CultureInfo culture) + { + Debug.Assert(c < 128, $"Expected ASCII, got {(int)c}"); + Debug.Assert(c != 'i' && c != 'I', "'i' currently doesn't work correctly in all cultures"); + + char upper = char.ToUpper(c, culture); + char lower = char.ToLower(c, culture); + + if (upper < lower) + { + vsb.Append(upper); + } + vsb.Append(lower); + if (upper > lower) + { + vsb.Append(upper); + } + + if (c == 'k' || c == 'K') + { + vsb.Append((char)0x212A); // kelvin sign + } + } + #endregion + /// /// Constructs the string representation of the class. 
/// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index d8700ebb9bd28..6f6c8cd8f8852 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -16,6 +16,7 @@ using System.Collections; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; namespace System.Text.RegularExpressions { @@ -96,35 +97,25 @@ internal sealed class RegexCode public readonly RegexTree Tree; // the optimized parse tree public readonly int[] Codes; // the code public readonly string[] Strings; // the string/set table - public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings + public readonly uint[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings public readonly int TrackCount; // how many instructions use backtracking public readonly Hashtable? Caps; // mapping of user group numbers -> impl group slots public readonly int CapSize; // number of impl group slots - public readonly (string CharClass, bool CaseInsensitive)[]? LeadingCharClasses; // the set of candidate first characters, if available. Each entry corresponds to the next char in the input. - public int[]? LeadingCharClassAsciiLookup; // the ASCII lookup table optimization for LeadingCharClasses[0], if it exists; only used by the interpreter - public readonly RegexBoyerMoore? 
BoyerMoorePrefix; // the fixed prefix string as a Boyer-Moore machine, if available - public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc) public readonly bool RightToLeft; // true if right to left + public readonly RegexFindOptimizations FindOptimizations; - public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, - Hashtable? caps, int capsize, - RegexBoyerMoore? boyerMoorePrefix, - (string CharClass, bool CaseInsensitive)[]? leadingCharClasses, - int leadingAnchor, bool rightToLeft) + public RegexCode(RegexTree tree, CultureInfo culture, int[] codes, string[] strings, int trackcount, + Hashtable? caps, int capsize) { - Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null); - Tree = tree; Codes = codes; Strings = strings; - StringsAsciiLookup = new int[strings.Length][]; + StringsAsciiLookup = new uint[strings.Length][]; TrackCount = trackcount; Caps = caps; CapSize = capsize; - BoyerMoorePrefix = boyerMoorePrefix; - LeadingCharClasses = leadingCharClasses; - LeadingAnchor = leadingAnchor; - RightToLeft = rightToLeft; + RightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; + FindOptimizations = new RegexFindOptimizations(tree, culture); } public static bool OpcodeBacktracks(int Op) @@ -409,26 +400,8 @@ public override string ToString() var sb = new StringBuilder(); sb.AppendLine($"Direction: {(RightToLeft ? 
"right-to-left" : "left-to-right")}"); - sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(LeadingAnchor)}"); + sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(FindOptimizations.LeadingAnchor)}"); sb.AppendLine(); - - if (BoyerMoorePrefix != null) - { - sb.AppendLine("Boyer-Moore:"); - sb.AppendLine(BoyerMoorePrefix.Dump(" ")); - sb.AppendLine(); - } - - if (LeadingCharClasses != null) - { - sb.AppendLine("First Chars:"); - for (int i = 0; i < LeadingCharClasses.Length; i++) - { - sb.AppendLine($"{i}: {RegexCharClass.SetDescription(LeadingCharClasses[i].CharClass)}"); - } - sb.AppendLine(); - } - for (int i = 0; i < Codes.Length; i += OpcodeSize(Codes[i])) { sb.AppendLine(OpcodeDescription(i)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index e288567bf14a7..49b88d3954cab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -62,6 +62,9 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { 
typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -90,9 +93,6 @@ internal abstract class RegexCompiler protected RegexCode? _code; // the RegexCode object protected int[]? _codes; // the RegexCodes being translated protected string[]? _strings; // the stringtable associated with the RegexCodes - protected (string CharClass, bool CaseInsensitive)[]? 
_leadingCharClasses; // the possible first chars computed by RegexPrefixAnalyzer - protected RegexBoyerMoore? _boyerMoorePrefix; // a prefix as a boyer-moore machine - protected int _leadingAnchor; // the set of anchors protected bool _hasTimeout; // whether the regex has a non-infinite timeout private Label[]? _labels; // a label for every operation in _codes @@ -928,20 +928,20 @@ protected void GenerateFindFirstChar() } _runtextLocal = DeclareString(); _textInfoLocal = null; - if (!_options.HasFlag(RegexOptions.CultureInvariant)) + if ((_options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = _options.HasFlag(RegexOptions.IgnoreCase) || _boyerMoorePrefix?.CaseInsensitive == true; - if (!needsCulture && _leadingCharClasses != null) + bool needsCulture = _code.FindOptimizations.FindMode switch { - for (int i = 0; i < _leadingCharClasses.Length; i++) - { - if (_leadingCharClasses[i].CaseInsensitive) - { - needsCulture = true; - break; - } - } - } + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or + FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + + _ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + + _ => false, + }; if (needsCulture) { @@ -1012,43 +1012,59 @@ protected void GenerateFindFirstChar() Ret(); MarkLabel(finishedLengthCheck); - GenerateAnchorChecks(); - - if (_boyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm) - { - if (rbm.PatternSupportsIndexOf) - { - GenerateIndexOf(rbm.Pattern); - } - else - { - GenerateBoyerMoore(rbm); - } - } - else if (_leadingCharClasses is not null) + // Emit any anchors. 
+ if (GenerateAnchors()) { - if (_code.RightToLeft) - { - GenerateLeadingCharacter_RightToLeft(); - } - else - { - GenerateLeadingCharacter_LeftToRight(); - } + return; } - else + + // Either anchors weren't specified, or they don't completely root all matches to a specific location. + + switch (_code.FindOptimizations.FindMode) { - // return true; - Ldc(1); - Ret(); + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); + GenerateIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); + GenerateIndexOf_RightToLeft(_code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + GenerateFixedSet_LeftToRight(); + break; + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + GenerateFixedSet_RightToLeft(); + break; + + default: + Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}"); + goto case FindNextStartingPositionMode.NoSearch; + + case FindNextStartingPositionMode.NoSearch: + // return true; + Ldc(1); + Ret(); + break; } - void GenerateAnchorChecks() + // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further + // searching is required; otherwise, false. 
+ bool GenerateAnchors() { // Generate anchor checks. - if ((_leadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) + if ((_code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) { - switch (_leadingAnchor) + switch (_code.FindOptimizations.LeadingAnchor) { case RegexPrefixAnalyzer.Beginning: { @@ -1072,7 +1088,7 @@ void GenerateAnchorChecks() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.Start: { @@ -1092,7 +1108,7 @@ void GenerateAnchorChecks() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.EndZ: { @@ -1134,9 +1150,9 @@ void GenerateAnchorChecks() } Ldc(1); Ret(); - return; + return true; - case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check + case RegexPrefixAnalyzer.End: { Label l1 = DefineLabel(); Ldloc(_runtextposLocal); @@ -1157,16 +1173,16 @@ void GenerateAnchorChecks() } Ldc(1); Ret(); - return; + return true; - case RegexPrefixAnalyzer.Bol when !_code.RightToLeft: // don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline + case RegexPrefixAnalyzer.Bol: { // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. + // to boost our position to the next line, and then continue normally with any prefix or char class searches. 
+ Debug.Assert(!_code.RightToLeft, "RightToLeft isn't implemented and should have been filtered out previously"); Label atBeginningOfLine = DefineLabel(); // if (runtextpos > runtextbeg... @@ -1218,218 +1234,11 @@ void GenerateAnchorChecks() break; } } - } - - void GenerateBoyerMoore(RegexBoyerMoore rbm) - { - LocalBuilder limitLocal; - int beforefirst; - int last; - if (!_code.RightToLeft) - { - limitLocal = _runtextendLocal; - beforefirst = -1; - last = rbm.Pattern.Length - 1; - } - else - { - limitLocal = _runtextbegLocal!; - beforefirst = rbm.Pattern.Length; - last = 0; - } - - int chLast = rbm.Pattern[last]; - // string runtext = this.runtext; - Mvfldloc(s_runtextField, _runtextLocal); - - // runtextpos += pattern.Length - 1; // advance to match last character - Ldloc(_runtextposLocal); - if (!_code.RightToLeft) - { - Ldc(rbm.Pattern.Length - 1); - Add(); - } - else - { - Ldc(rbm.Pattern.Length); - Sub(); - } - Stloc(_runtextposLocal); - - Label lStart = DefineLabel(); - Br(lStart); - - // DefaultAdvance: - // offset = pattern.Length; - Label lDefaultAdvance = DefineLabel(); - MarkLabel(lDefaultAdvance); - Ldc(_code.RightToLeft ? 
-rbm.Pattern.Length : rbm.Pattern.Length); - - // Advance: - // runtextpos += offset; - Label lAdvance = DefineLabel(); - MarkLabel(lAdvance); - Ldloc(_runtextposLocal); - Add(); - Stloc(_runtextposLocal); - - // Start: - // if (runtextpos >= runtextend) goto returnFalse; - MarkLabel(lStart); - Ldloc(_runtextposLocal); - Ldloc(limitLocal); - if (!_code.RightToLeft) - { - BgeFar(returnFalse); - } - else - { - BltFar(returnFalse); - } - - // ch = runtext[runtextpos]; - Rightchar(); - if (rbm.CaseInsensitive) - { - CallToLower(); - } - - Label lPartialMatch = DefineLabel(); - using (RentedLocalBuilder chLocal = RentInt32Local()) - { - Stloc(chLocal); - Ldloc(chLocal); - Ldc(chLast); - - // if (ch == lastChar) goto partialMatch; - BeqFar(lPartialMatch); - - // ch -= lowAscii; - // if (ch > (highAscii - lowAscii)) goto defaultAdvance; - Ldloc(chLocal); - Ldc(rbm.LowASCII); - Sub(); - Stloc(chLocal); - Ldloc(chLocal); - Ldc(rbm.HighASCII - rbm.LowASCII); - BgtUn(lDefaultAdvance); - - // int offset = "lookupstring"[num]; - // goto advance; - int negativeRange = rbm.HighASCII - rbm.LowASCII + 1; - if (negativeRange > 1) - { - // Create a string to store the lookup table we use to find the offset. - Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed."); - string negativeLookup = string.Create(negativeRange, (rbm, beforefirst), static (span, state) => - { - // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate - // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again. 
- for (int i = 0; i < span.Length; i++) - { - int offset = state.rbm.NegativeASCII[i + state.rbm.LowASCII]; - if (offset == state.beforefirst) - { - offset = state.rbm.Pattern.Length; - } - else if (state.rbm.RightToLeft) - { - offset = -offset; - } - Debug.Assert(offset >= 0 && offset <= char.MaxValue); - span[i] = (char)offset; - } - }); - - // offset = lookupString[ch]; - // goto Advance; - Ldstr(negativeLookup); - Ldloc(chLocal); - Call(s_stringGetCharsMethod); - if (_code.RightToLeft) - { - Neg(); - } - } - else - { - // offset = value; - Debug.Assert(negativeRange == 1); - int offset = rbm.NegativeASCII[rbm.LowASCII]; - if (offset == beforefirst) - { - offset = _code.RightToLeft ? -rbm.Pattern.Length : rbm.Pattern.Length; - } - Ldc(offset); - } - BrFar(lAdvance); - } - - // Emit a check for each character from the next to last down to the first. - MarkLabel(lPartialMatch); - Ldloc(_runtextposLocal); - using (RentedLocalBuilder testLocal = RentInt32Local()) - { - Stloc(testLocal); - - int prevLabelOffset = int.MaxValue; - Label prevLabel = default; - for (int i = rbm.Pattern.Length - 2; i >= 0; i--) - { - int charindex = _code.RightToLeft ? 
rbm.Pattern.Length - 1 - i : i; - - // if (runtext[--test] == pattern[index]) goto lNext; - Ldloc(_runtextLocal); - Ldloc(testLocal); - Ldc(1); - Sub(_code.RightToLeft); - Stloc(testLocal); - Ldloc(testLocal); - Call(s_stringGetCharsMethod); - if (rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charindex])) - { - CallToLower(); - } - Ldc(rbm.Pattern[charindex]); - - if (prevLabelOffset == rbm.Positive[charindex]) - { - BneFar(prevLabel); - } - else - { - Label lNext = DefineLabel(); - Beq(lNext); - - // offset = positive[ch]; - // goto advance; - prevLabel = DefineLabel(); - prevLabelOffset = rbm.Positive[charindex]; - MarkLabel(prevLabel); - Ldc(prevLabelOffset); - BrFar(lAdvance); - - MarkLabel(lNext); - } - } - - // this.runtextpos = test; - // return true; - Ldthis(); - Ldloc(testLocal); - if (_code.RightToLeft) - { - Ldc(1); - Add(); - } - Stfld(s_runtextposField); - Ldc(1); - Ret(); - } + return false; } - void GenerateIndexOf(string prefix) + void GenerateIndexOf_LeftToRight(string prefix) { using RentedLocalBuilder i = RentInt32Local(); @@ -1446,11 +1255,7 @@ void GenerateIndexOf(string prefix) Call(s_spanIndexOfSpan); Stloc(i); - // if (i < 0) - // { - // base.runtextpos = runtextend; - // return false; - // } + // if (i < 0) goto ReturnFalse; Ldloc(i); Ldc(0); BltFar(returnFalse); @@ -1466,105 +1271,135 @@ void GenerateIndexOf(string prefix) Ret(); } - void GenerateLeadingCharacter_RightToLeft() + void GenerateIndexOf_RightToLeft(string prefix) { - Debug.Assert(_leadingCharClasses.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft"); - - using RentedLocalBuilder cLocal = RentInt32Local(); - - Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - Label l3 = DefineLabel(); - Label l4 = DefineLabel(); - Label l5 = DefineLabel(); - - Mvfldloc(s_runtextField, _runtextLocal); + using RentedLocalBuilder i = RentInt32Local(); + // int i = runtext.AsSpan(runtextpos, runtextbeg, 
runtextpos - runtextbeg).LastIndexOf(prefix); + Ldthis(); + Ldfld(s_runtextField); + Ldloc(_runtextbegLocal!); Ldloc(_runtextposLocal); Ldloc(_runtextbegLocal!); Sub(); - Stloc(cLocal); + Call(s_stringAsSpanIntIntMethod); + Ldstr(prefix); + Call(s_stringAsSpanMethod); + Call(s_spanLastIndexOfSpan); + Stloc(i); - if (minRequiredLength == 0) // if minRequiredLength > 0, we already output a more stringent check - { - Ldloc(cLocal); - Ldc(0); - BleFar(l4); - } + // if (i < 0) goto ReturnFalse; + Ldloc(i); + Ldc(0); + BltFar(returnFalse); - MarkLabel(l1); - Ldloc(cLocal); + // base.runtextpos = runtextbeg + i + prefix.Length; + // return true; + Ldthis(); + Ldloc(_runtextbegLocal!); + Ldloc(i); + Add(); + Ldc(prefix.Length); + Add(); + Stfld(s_runtextposField); Ldc(1); - Sub(); - Stloc(cLocal); + Ret(); + } - Leftcharnext(); + void GenerateFixedSet_RightToLeft() + { + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = _code.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); - if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass)) - { - EmitMatchCharacterClass(_leadingCharClasses[0].CharClass, _leadingCharClasses[0].CaseInsensitive); - Brtrue(l2); - } - else + using RentedLocalBuilder i = RentInt32Local(); + + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) { - Ldc(RegexCharClass.SingletonChar(_leadingCharClasses[0].CharClass)); - Beq(l2); - } + // int i = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg).LastIndexOf(set.Chars[0]); + Ldthis(); + Ldfld(s_runtextField); + Ldloc(_runtextbegLocal!); + Ldloc(_runtextposLocal); + Ldloc(_runtextbegLocal!); + Sub(); + Call(s_stringAsSpanIntIntMethod); + Ldc(set.Chars[0]); + Call(s_spanLastIndexOfChar); + Stloc(i); - MarkLabel(l5); + // if (i < 0) goto ReturnFalse; + Ldloc(i); + Ldc(0); + BltFar(returnFalse); - Ldloc(cLocal); - Ldc(0); - if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass)) - { - BgtFar(l1); + // 
base.runtextpos = runtextbeg + i + 1; + // return true; + Ldthis(); + Ldloc(_runtextbegLocal!); + Ldloc(i); + Add(); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); } else { - Bgt(l1); - } + Label condition = DefineLabel(); + Label increment = DefineLabel(); + Label body = DefineLabel(); - Ldc(0); - Br(l3); + Mvfldloc(s_runtextField, _runtextLocal); - MarkLabel(l2); - - Ldloc(_runtextposLocal); - Ldc(1); - Sub(_code.RightToLeft); - Stloc(_runtextposLocal); - Ldc(1); + // for (int i = runtextpos - 1; ... + Ldloc(_runtextposLocal); + Ldc(1); + Sub(); + Stloc(i); + BrFar(condition); + + // if (MatchCharClass(runtext[i], set)) + MarkLabel(body); + Ldloc(_runtextLocal); + Ldloc(i); + Call(s_stringGetCharsMethod); + EmitMatchCharacterClass(set.Set, set.CaseInsensitive); + Brfalse(increment); + + // base.runtextpos = i + 1; + // return true; + Ldthis(); + Ldloc(i); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); - MarkLabel(l3); + // for (...; ...; i--) + MarkLabel(increment); + Ldloc(i); + Ldc(1); + Sub(); + Stloc(i); - Mvlocfld(_runtextposLocal, s_runtextposField); - Ret(); + // for (...; i >= runtextbeg; ...) + MarkLabel(condition); + Ldloc(i); + Ldloc(_runtextbegLocal!); + BgeFar(body); - MarkLabel(l4); - Ldc(0); - Ret(); + BrFar(returnFalse); + } } - void GenerateLeadingCharacter_LeftToRight() + void GenerateFixedSet_LeftToRight() { - Debug.Assert(_leadingCharClasses != null && _leadingCharClasses.Length > 0); - - // If minRequiredLength > 0, we already output a more stringent check. In the rare case - // where we were unable to get an accurate enough min required length to ensure it's larger - // than the prefixes we calculated, we also need to ensure we have enough spaces for those, - // as they also represent a min required length. 
- if (minRequiredLength < _leadingCharClasses.Length) - { - // if (runtextpos >= runtextend - (_leadingCharClasses.Length - 1)) goto returnFalse; - Ldloc(_runtextendLocal); - if (_leadingCharClasses.Length > 1) - { - Ldc(_leadingCharClasses.Length - 1); - Sub(); - } - Ldloc(_runtextposLocal); - BleFar(returnFalse); - } + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets; + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; + const int MaxSets = 4; + int setsToUse = Math.Min(sets.Count, MaxSets); using RentedLocalBuilder iLocal = RentInt32Local(); using RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal(); @@ -1580,13 +1415,9 @@ void GenerateLeadingCharacter_LeftToRight() // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. 
- Span setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below - int setCharsCount = 0, charClassIndex = 0; - bool canUseIndexOf = - !_leadingCharClasses[0].CaseInsensitive && - (setCharsCount = RegexCharClass.GetSetChars(_leadingCharClasses[0].CharClass, setChars)) > 0 && - !RegexCharClass.IsNegated(_leadingCharClasses[0].CharClass); - bool needLoop = !canUseIndexOf || _leadingCharClasses.Length > 1; + int setIndex = 0; + bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null; + bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; Label charNotInClassLabel = default; @@ -1606,13 +1437,25 @@ void GenerateLeadingCharacter_LeftToRight() if (canUseIndexOf) { - charClassIndex = 1; + setIndex = 1; if (needLoop) { - // textSpan.Slice(iLocal) + // textSpan.Slice(iLocal + primarySet.Distance); Ldloca(textSpanLocal); Ldloc(iLocal); + if (primarySet.Distance != 0) + { + Ldc(primarySet.Distance); + Add(); + } + Call(s_spanSliceIntMethod); + } + else if (primarySet.Distance != 0) + { + // textSpan.Slice(primarySet.Distance) + Ldloca(textSpanLocal); + Ldc(primarySet.Distance); Call(s_spanSliceIntMethod); } else @@ -1621,29 +1464,34 @@ void GenerateLeadingCharacter_LeftToRight() Ldloc(textSpanLocal); } - switch (setCharsCount) + switch (primarySet.Chars!.Length) { case 1: // tmp = ...IndexOf(setChars[0]); - Ldc(setChars[0]); + Ldc(primarySet.Chars[0]); Call(s_spanIndexOfChar); break; case 2: // tmp = ...IndexOfAny(setChars[0], setChars[1]); - Ldc(setChars[0]); - Ldc(setChars[1]); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); Call(s_spanIndexOfAnyCharChar); break; - default: // 3 + case 3: // tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]}); - Debug.Assert(setCharsCount == 3); - Ldc(setChars[0]); - Ldc(setChars[1]); - Ldc(setChars[2]); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); + Ldc(primarySet.Chars[2]); Call(s_spanIndexOfAnyCharCharChar); break; + + default: 
+ Ldstr(new string(primarySet.Chars)); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); + break; } if (needLoop) @@ -1672,13 +1520,13 @@ void GenerateLeadingCharacter_LeftToRight() BltFar(returnFalse); } - // if (i >= textSpan.Length - (_leadingCharClasses.Length - 1)) goto returnFalse; - if (_leadingCharClasses.Length > 1) + // if (i >= textSpan.Length - (minRequiredLength - 1)) goto returnFalse; + if (sets.Count > 1) { Debug.Assert(needLoop); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - Ldc(_leadingCharClasses.Length - 1); + Ldc(minRequiredLength - 1); Sub(); Ldloc(iLocal); BleFar(returnFalse); @@ -1689,20 +1537,20 @@ void GenerateLeadingCharacter_LeftToRight() // if (!CharInClass(textSpan[i + 1], prefix[1], "...")) continue; // if (!CharInClass(textSpan[i + 2], prefix[2], "...")) continue; // ... - Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++) + Debug.Assert(setIndex == 0 || setIndex == 1); + for ( ; setIndex < sets.Count; setIndex++) { Debug.Assert(needLoop); Ldloca(textSpanLocal); Ldloc(iLocal); - if (charClassIndex > 0) + if (sets[setIndex].Distance != 0) { - Ldc(charClassIndex); + Ldc(sets[setIndex].Distance); Add(); } Call(s_spanGetItemMethod); LdindU2(); - EmitMatchCharacterClass(_leadingCharClasses[charClassIndex].CharClass, _leadingCharClasses[charClassIndex].CaseInsensitive); + EmitMatchCharacterClass(sets[setIndex].Set, sets[setIndex].CaseInsensitive); BrfalseFar(charNotInClassLabel); } @@ -1726,14 +1574,14 @@ void GenerateLeadingCharacter_LeftToRight() Add(); Stloc(iLocal); - // for (...; i < span.Length - (_leadingCharClasses.Length - 1); ...); + // for (...; i < span.Length - (minRequiredLength - 1); ...); MarkLabel(checkSpanLengthLabel); Ldloc(iLocal); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - if (_leadingCharClasses.Length > 1) + if (setsToUse > 1 || primarySet.Distance != 0) { - Ldc(_leadingCharClasses.Length - 1); + 
Ldc(minRequiredLength - 1); Sub(); } BltFar(loopBody); @@ -2278,23 +2126,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.Concatenate: - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) - { - EmitSpanLengthCheck(requiredLength); - for (; i < exclusiveEnd; i++) - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); - } - - i--; - continue; - } - - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); - } + EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Capture: @@ -2339,6 +2171,28 @@ void EmitUpdateBumpalong() Stfld(s_runtextposField); } + // Emits code for a concatenation + void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) + { + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + { + EmitSpanLengthCheck(requiredLength); + for (; i < exclusiveEnd; i++) + { + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); + } + + i--; + continue; + } + + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); + } + } + // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null) { @@ -2362,7 +2216,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? 
o } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -2595,7 +2449,7 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) EmitTextSpanOffset(); textSpanPos++; LdindU2(); - if (caseInsensitive && RegexCharClass.ParticipatesInCaseConversion(s[i])) + if (caseInsensitive) { CallToLower(); } @@ -2865,12 +2719,12 @@ void EmitSingleCharAtomicLoop(RegexNode node) Label atomicLoopDoneLabel = DefineLabel(); - Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars = 0; if (node.IsNotoneFamily && maxIterations == int.MaxValue && - (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) + (!IsCaseInsensitive(node))) { // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, @@ -2911,14 +2765,15 @@ void EmitSingleCharAtomicLoop(RegexNode node) else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 && RegexCharClass.IsNegated(node.Str!)) { - // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would + // If the set is negated and contains only a few characters (if it contained 1 and was negated, it would // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. 
+ Debug.Assert(numSetChars > 1); - // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2{, ch3}); + // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2, ...); if (textSpanPos > 0) { Ldloca(textSpanLocal); @@ -2929,17 +2784,26 @@ void EmitSingleCharAtomicLoop(RegexNode node) { Ldloc(textSpanLocal); } - Ldc(setChars[0]); - Ldc(setChars[1]); - if (numSetChars == 2) - { - Call(s_spanIndexOfAnyCharChar); - } - else + switch (numSetChars) { - Debug.Assert(numSetChars == 3); - Ldc(setChars[2]); - Call(s_spanIndexOfAnyCharCharChar); + case 2: + Ldc(setChars[0]); + Ldc(setChars[1]); + Call(s_spanIndexOfAnyCharChar); + break; + + case 3: + Ldc(setChars[0]); + Ldc(setChars[1]); + Ldc(setChars[2]); + Call(s_spanIndexOfAnyCharCharChar); + break; + + default: + Ldstr(setChars.Slice(0, numSetChars).ToString()); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); // IndexOfAny: stop at any one of the set chars, not at the chars as a substring + break; } Stloc(iterationLocal); @@ -3008,7 +2872,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -3095,7 +2959,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -4185,7 +4049,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4231,7 +4095,7 @@ private void GenerateOneCode() Add(); } Call(s_stringGetCharsMethod); - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i])) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4274,7 +4138,7 @@ private void GenerateOneCode() Ldc(str.Length - i); Sub(); Call(s_stringGetCharsMethod); - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i])) + if (IsCaseInsensitive()) { CallToLower(); } @@ 
-4477,7 +4341,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4579,14 +4443,14 @@ private void GenerateOneCode() Label loopEnd = DefineLabel(); string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? _strings![Operand(0)] : null; - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars; // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, // we can use the vectorized IndexOf to search for the target character. if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && !IsRightToLeft() && - (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0)))) + (!IsCaseInsensitive())) { // i = runtext.AsSpan(runtextpos, len).IndexOf(ch); Ldloc(_runtextLocal!); @@ -4633,29 +4497,39 @@ private void GenerateOneCode() else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && !IsRightToLeft() && !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && RegexCharClass.IsNegated(set!)) { // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny + // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny // to search for those chars. 
+ Debug.Assert(numSetChars > 1); // i = runtext.AsSpan(runtextpos, len).IndexOfAny(ch1, ch2{, ch3}); Ldloc(_runtextLocal!); Ldloc(_runtextposLocal!); Ldloc(lenLocal); Call(s_stringAsSpanIntIntMethod); - Ldc(setChars[0]); - Ldc(setChars[1]); - if (numSetChars == 2) - { - Call(s_spanIndexOfAnyCharChar); - } - else + switch (numSetChars) { - Debug.Assert(numSetChars == 3); - Ldc(setChars[2]); - Call(s_spanIndexOfAnyCharCharChar); + case 2: + Ldc(setChars[0]); + Ldc(setChars[1]); + Call(s_spanIndexOfAnyCharChar); + break; + + case 3: + Ldc(setChars[0]); + Ldc(setChars[1]); + Ldc(setChars[2]); + Call(s_spanIndexOfAnyCharCharChar); + break; + + default: + Ldstr(setChars.Slice(0, numSetChars).ToString()); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); // IndexOfAny: stop at any one of the set chars, not at the chars as a substring + break; } Stloc(iLocal); @@ -4754,7 +4628,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4955,7 +4829,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -5105,21 +4979,34 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive) // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it's cheaper and smaller to compare against each than it is to use a lookup table. 
- if (!invariant) + if (!invariant && !RegexCharClass.IsNegated(charClass)) { - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[4]; int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (numChars > 0 && !RegexCharClass.IsNegated(charClass)) + if (numChars is 2 or 3) { - // (ch == setChars[0]) | (ch == setChars[1]) { | (ch == setChars[2]) } - Debug.Assert(numChars == 2 || numChars == 3); - Ldloc(tempLocal); - Ldc(setChars[0]); - Ceq(); - Ldloc(tempLocal); - Ldc(setChars[1]); - Ceq(); - Or(); + if ((setChars[0] | 0x20) == setChars[1]) // special-case common case of an upper and lowercase ASCII letter combination + { + // ((ch | 0x20) == setChars[1]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[1]); + Ceq(); + } + else + { + // (ch == setChars[0]) | (ch == setChars[1]) + Ldloc(tempLocal); + Ldc(setChars[0]); + Ceq(); + Ldloc(tempLocal); + Ldc(setChars[1]); + Ceq(); + Or(); + } + + // | (ch == setChars[2]) if (numChars == 3) { Ldloc(tempLocal); @@ -5130,6 +5017,27 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive) return; } + else if (numChars == 4 && + (setChars[0] | 0x20) == setChars[1] && + (setChars[2] | 0x20) == setChars[3]) + { + // ((ch | 0x20) == setChars[1]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[1]); + Ceq(); + + // ((ch | 0x20) == setChars[3]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[3]); + Ceq(); + + Or(); + return; + } } using RentedLocalBuilder resultLocal = RentInt32Local(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs new file mode 100644 index 0000000000000..f1b285818e93e --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -0,0 +1,664 @@ +// Licensed to the .NET Foundation under one or more 
agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; + +namespace System.Text.RegularExpressions +{ + /// Contains state and provides operations related to finding the next location a match could possibly begin. + internal sealed class RegexFindOptimizations + { + /// The minimum required length an input need be to match the pattern. May be 0. + private readonly int _minRequiredLength; + /// True if the input should be processed right-to-left rather than left-to-right. + private readonly bool _rightToLeft; + /// Provides the ToLower routine for lowercasing characters. + private readonly TextInfo _textInfo; + /// Lookup table used for optimizing ASCII when doing set queries. + private readonly uint[]?[]? _asciiLookups; + + public RegexFindOptimizations(RegexTree tree, CultureInfo culture) + { + _rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; + _minRequiredLength = tree.MinRequiredLength; + _textInfo = culture.TextInfo; + + // Compute any anchor starting the expression. If there is one, we won't need to search for anything, + // as we can just match at that single location. + LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); + if (_rightToLeft) + { + // Filter out Bol for RightToLeft, as we don't currently optimize for it. 
+ LeadingAnchor &= ~RegexPrefixAnalyzer.Bol; + } + if ((LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0) + { + FindMode = (LeadingAnchor, _rightToLeft) switch + { + (RegexPrefixAnalyzer.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning, + (RegexPrefixAnalyzer.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning, + (RegexPrefixAnalyzer.Start, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start, + (RegexPrefixAnalyzer.Start, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start, + (RegexPrefixAnalyzer.End, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End, + (RegexPrefixAnalyzer.End, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End, + (_, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ, + (_, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ, + }; + return; + } + + // If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations. + string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree); + if (caseSensitivePrefix.Length > 1) + { + LeadingCaseSensitivePrefix = caseSensitivePrefix; + FindMode = _rightToLeft ? + FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive : + FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive; + return; + } + + // At this point there are no fast-searchable anchors or case-sensitive prefixes. We can now analyze the + // pattern for sets and then use any found sets to determine what kind of search to perform. + + // If we're compiling, then the compilation process already handles sets that reduce to a single literal, + // so we can simplify and just always go for the sets. 
+ bool dfa = (tree.Options & RegexOptions.NonBacktracking) != 0; + bool compiled = (tree.Options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled + bool interpreter = !compiled && !dfa; + + // For interpreter, we want to employ optimizations, but we don't want to make construction significantly + // more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter + // we focus only on creating a set for the first character. Same for right-to-left, which is used very + // rarely and thus we don't need to invest in special-casing it. + if (_rightToLeft) + { + // Determine a set for anything that can possibly start the expression. + if (RegexPrefixAnalyzer.FindFirstCharClass(tree, culture) is (string CharClass, bool CaseInsensitive) set) + { + // See if the set is limited to holding only a few characters. + Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today + int scratchCount; + char[]? chars = null; + if (!RegexCharClass.IsNegated(set.CharClass) && + (scratchCount = RegexCharClass.GetSetChars(set.CharClass, scratch)) > 0) + { + chars = scratch.Slice(0, scratchCount).ToArray(); + } + + if (!compiled && + chars is { Length: 1 }) + { + // The set contains one and only one character, meaning every match starts + // with the same literal value (potentially case-insensitive). Search for that. 
+ FixedDistanceLiteral = (chars[0], 0); + FindMode = (_rightToLeft, set.CaseInsensitive) switch + { + (false, false) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive, + (true, true) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive, + }; + } + else + { + // The set may match multiple characters. Search for that. 
+ FixedDistanceSets = new() { (chars, set.CharClass, 0, set.CaseInsensitive) }; + FindMode = (_rightToLeft, set.CaseInsensitive) switch + { + (false, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive, + (true, true) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive, + }; + _asciiLookups = new uint[1][]; + } + } + return; + } + + // We're now left-to-right only and looking for sets. + + // Build up a list of all of the sets that are a fixed distance from the start of the expression. + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(tree, culture, thorough: !interpreter); + if (fixedDistanceSets is not null) + { + Debug.Assert(fixedDistanceSets.Count != 0); + + // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines + // don't need to special-case literals as they already do codegen to create the optimal lookup based on + // the set's characteristics. + if (!compiled && + fixedDistanceSets.Count == 1 && + fixedDistanceSets[0].Chars is { Length: 1 }) + { + FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], fixedDistanceSets[0].Distance); + FindMode = fixedDistanceSets[0].CaseInsensitive ? + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive : + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive; + } + else + { + // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already + // sorted from best to worst, so just keep the first ones up to our limit. 
+ const int MaxSetsToUse = 3; // arbitrary tuned limit + if (fixedDistanceSets.Count > MaxSetsToUse) + { + fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse); + } + + // Store the sets, and compute which mode to use. + FixedDistanceSets = fixedDistanceSets; + FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0, fixedDistanceSets[0].CaseInsensitive) switch + { + (true, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive, + (false, false) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive, + }; + _asciiLookups = new uint[fixedDistanceSets.Count][]; + } + return; + } + } + + /// <summary>Gets the selected mode for performing the next operation.</summary> + public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch; + + /// <summary>Gets the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc).</summary> + public int LeadingAnchor { get; } + + /// <summary>Gets the leading prefix. May be an empty string.</summary> + public string LeadingCaseSensitivePrefix { get; } = string.Empty; + + /// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary> + public (char Literal, int Distance) FixedDistanceLiteral { get; } + + /// <summary>When in fixed distance set mode, gets the set and how far it is from the start of the pattern.</summary> + /// <remarks>The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not.</remarks> + public List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FixedDistanceSets { get; } + + /// <summary>Try to advance to the next starting position that might be a location for a match.</summary> + /// <param name="text">The text to search.</param> + /// <param name="pos">The position in <paramref name="text"/>. This is updated with the found position.</param> 
+ /// <param name="beginning">The index in <paramref name="text"/> to consider the beginning for beginning anchor purposes.</param> + /// <param name="start">The index in <paramref name="text"/> to consider the start for start anchor purposes.</param> + /// <param name="end">The index in <paramref name="text"/> to consider the non-inclusive end of the string.</param> + /// <returns>true if a position to attempt a match was found; false if none was found.</returns> + public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, int start, int end) + { + // Return early if we know there's not enough input left to match. + if (!_rightToLeft) + { + if (pos > end - _minRequiredLength) + { + pos = end; + return false; + } + } + else + { + if (pos - _minRequiredLength < beginning) + { + pos = beginning; + return false; + } + } + + // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right). BOL is special, in that unlike + // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike + // the other anchors, which all skip all subsequent processing if found, with BOL we just use it + // to boost our position to the next line, and then continue normally with any searches. + if (LeadingAnchor == RegexPrefixAnalyzer.Bol) + { + // If we're not currently positioned at the beginning of a line (either + // the beginning of the string or just after a line feed), find the next + // newline and position just after it. + Debug.Assert(!_rightToLeft); + if (pos > beginning && text[pos - 1] != '\n') + { + int newline = text.IndexOf('\n', pos); + if (newline == -1 || newline + 1 > end) + { + pos = end; + return false; + } + + pos = newline + 1; + } + } + + switch (FindMode) + { + // There's an anchor. For some, we can simply compare against the current position. + // For others, we can jump to the relevant location. 
+ + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: + if (pos > beginning) + { + pos = end; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: + if (pos > start) + { + pos = end; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: + if (pos < end - 1) + { + pos = end - 1; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: + if (pos < end) + { + pos = end; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + if (pos > beginning) + { + pos = beginning; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + if (pos < start) + { + pos = beginning; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + if (pos < end - 1 || (pos == end - 1 && text[pos] != '\n')) + { + pos = beginning; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + if (pos < end) + { + pos = beginning; + return false; + } + return true; + + // There's a case-sensitive prefix. Search for it with ordinal IndexOf. + + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + { + int i = text.AsSpan(pos, end - pos).IndexOf(LeadingCaseSensitivePrefix.AsSpan()); + if (i >= 0) + { + pos += i; + return true; + } + + pos = end; + return false; + } + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + { + int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(LeadingCaseSensitivePrefix.AsSpan()); + if (i >= 0) + { + pos = beginning + i + LeadingCaseSensitivePrefix.Length; + return true; + } + + pos = beginning; + return false; + } + + // There's a literal at the beginning of the pattern. Search for it. 
+ + case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive: + { + int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(FixedDistanceLiteral.Literal); + if (i >= 0) + { + pos = beginning + i + 1; + return true; + } + + pos = beginning; + return false; + } + + case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive: + { + char ch = FixedDistanceLiteral.Literal; + TextInfo ti = _textInfo; + + ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + for (int i = span.Length - 1; i >= 0; i--) + { + if (ti.ToLower(span[i]) == ch) + { + pos = beginning + i + 1; + return true; + } + } + + pos = beginning; + return false; + } + + // There's a set at the beginning of the pattern. Search for it. + + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + { + (char[]? chars, string set, _, _) = FixedDistanceSets![0]; + + ReadOnlySpan span = text.AsSpan(pos, end - pos); + if (chars is not null) + { + int i = span.IndexOfAny(chars); + if (i >= 0) + { + pos += i; + return true; + } + } + else + { + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + for (int i = 0; i < span.Length; i++) + { + if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup)) + { + pos += i; + return true; + } + } + } + + pos = end; + return false; + } + + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + { + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + string set = FixedDistanceSets![0].Set; + TextInfo ti = _textInfo; + + ReadOnlySpan span = text.AsSpan(pos, end - pos); + for (int i = 0; i < span.Length; i++) + { + if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup)) + { + pos += i; + return true; + } + } + + pos = end; + return false; + } + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + { + ref uint[]? 
startingAsciiLookup = ref _asciiLookups![0]; + string set = FixedDistanceSets![0].Set; + + ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + for (int i = span.Length - 1; i >= 0; i--) + { + if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup)) + { + pos = beginning + i + 1; + return true; + } + } + + pos = beginning; + return false; + } + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + { + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + string set = FixedDistanceSets![0].Set; + TextInfo ti = _textInfo; + + ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + for (int i = span.Length - 1; i >= 0; i--) + { + if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup)) + { + pos = beginning + i + 1; + return true; + } + } + + pos = beginning; + return false; + } + + // There's a literal at a fixed offset from the beginning of the pattern. Search for it. + + case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive: + { + Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength); + + int i = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal); + if (i >= 0) + { + pos += i; + return true; + } + + pos = end; + return false; + } + + case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive: + { + Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength); + + char ch = FixedDistanceLiteral.Literal; + TextInfo ti = _textInfo; + + ReadOnlySpan span = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance); + for (int i = 0; i < span.Length; i++) + { + if (ti.ToLower(span[i]) == ch) + { + pos += i; + return true; + } + } + + pos = end; + return false; + } + + // There are one or more sets at fixed offsets from the start of the pattern. 
+ + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + { + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!; + (char[]? primaryChars, string primarySet, int primaryDistance, _) = sets[0]; + int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength); + + if (primaryChars is not null) + { + for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) + { + int offset = inputPosition + primaryDistance; + int index = text.IndexOfAny(primaryChars, offset, end - offset); + if (index < 0) + { + break; + } + + inputPosition = index - primaryDistance; + if (inputPosition > endMinusRequiredLength) + { + break; + } + + for (int i = 1; i < sets.Count; i++) + { + (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; + char c = text[inputPosition + nextDistance]; + if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) + { + goto Bumpalong; + } + } + + pos = inputPosition; + return true; + + Bumpalong:; + } + } + else + { + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + + for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) + { + char c = text[inputPosition + primaryDistance]; + if (!RegexCharClass.CharInClass(c, primarySet, ref startingAsciiLookup)) + { + goto Bumpalong; + } + + for (int i = 1; i < sets.Count; i++) + { + (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; + c = text[inputPosition + nextDistance]; + if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) + { + goto Bumpalong; + } + } + + pos = inputPosition; + return true; + + Bumpalong:; + } + } + + pos = end; + return false; + } + + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + { + List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!; + (_, string primarySet, int primaryDistance, _) = sets[0]; + + int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength); + TextInfo ti = _textInfo; + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + + for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) + { + char c = text[inputPosition + primaryDistance]; + if (!RegexCharClass.CharInClass(ti.ToLower(c), primarySet, ref startingAsciiLookup)) + { + goto Bumpalong; + } + + for (int i = 1; i < sets.Count; i++) + { + (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; + c = text[inputPosition + nextDistance]; + if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) + { + goto Bumpalong; + } + } + + pos = inputPosition; + return true; + + Bumpalong:; + } + + pos = end; + return false; + } + + // Nothing special to look for. Just return true indicating this is a valid position to try to match. + + default: + Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch); + return true; + } + } + } + + /// Mode to use for searching for the next location of a possible match. + internal enum FindNextStartingPositionMode + { + /// A "beginning" anchor at the beginning of the pattern. + LeadingAnchor_LeftToRight_Beginning, + /// A "start" anchor at the beginning of the pattern. + LeadingAnchor_LeftToRight_Start, + /// An "endz" anchor at the beginning of the pattern. This is rare. + LeadingAnchor_LeftToRight_EndZ, + /// An "end" anchor at the beginning of the pattern. This is rare. + LeadingAnchor_LeftToRight_End, + + /// A "beginning" anchor at the beginning of the right-to-left pattern. + LeadingAnchor_RightToLeft_Beginning, + /// A "start" anchor at the beginning of the right-to-left pattern. + LeadingAnchor_RightToLeft_Start, + /// An "endz" anchor at the beginning of the right-to-left pattern. 
This is rare. + LeadingAnchor_RightToLeft_EndZ, + /// An "end" anchor at the beginning of the right-to-left pattern. This is rare. + LeadingAnchor_RightToLeft_End, + + /// A case-sensitive multi-character substring at the beginning of the pattern. + LeadingPrefix_LeftToRight_CaseSensitive, + /// A case-sensitive multi-character substring at the beginning of the right-to-left pattern. + LeadingPrefix_RightToLeft_CaseSensitive, + + /// A case-sensitive set starting the pattern. + LeadingSet_LeftToRight_CaseSensitive, + /// A case-insensitive set starting the pattern. + LeadingSet_LeftToRight_CaseInsensitive, + /// A case-sensitive set starting the right-to-left pattern. + LeadingSet_RightToLeft_CaseSensitive, + /// A case-insensitive set starting the right-to-left pattern. + LeadingSet_RightToLeft_CaseInsensitive, + + /// A case-sensitive single character starting the right-to-left pattern. + LeadingLiteral_RightToLeft_CaseSensitive, + /// A case-insensitive single character starting the right-to-left pattern. + LeadingLiteral_RightToLeft_CaseInsensitive, + + /// A case-sensitive single character at a fixed distance from the start of the pattern. + FixedLiteral_LeftToRight_CaseSensitive, + /// A case-insensitive single character at a fixed distance from the start of the pattern. + FixedLiteral_LeftToRight_CaseInsensitive, + + /// One or more sets at a fixed distance from the start of the pattern. At least the first set is case-sensitive. + FixedSets_LeftToRight_CaseSensitive, + /// One or more sets at a fixed distance from the start of the pattern. At least the first set is case-insensitive. + FixedSets_LeftToRight_CaseInsensitive, + + /// Nothing to search for. Nop. 
+ NoSearch, + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index c679c0101d6f2..4351473b96fdb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -15,7 +15,6 @@ internal sealed class RegexInterpreter : RegexRunner private readonly RegexCode _code; private readonly TextInfo _textInfo; - private readonly FindFirstCharMode _findFirstCharMode; private int _operator; private int _codepos; @@ -29,48 +28,6 @@ public RegexInterpreter(RegexCode code, CultureInfo culture) _code = code; _textInfo = culture.TextInfo; - - // Determine what searching mode FindFirstChar will employ. - if ((_code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0) - { - _findFirstCharMode = (_code.LeadingAnchor, code.RightToLeft) switch - { - (RegexPrefixAnalyzer.Beginning, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning, - (RegexPrefixAnalyzer.Beginning, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning, - (RegexPrefixAnalyzer.Start, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Start, - (RegexPrefixAnalyzer.Start, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Start, - (RegexPrefixAnalyzer.End, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_End, - (RegexPrefixAnalyzer.End, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_End, - (_, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ, - (_, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ, - }; - } - else if (code.BoyerMoorePrefix is RegexBoyerMoore rbm) - { - _findFirstCharMode = rbm.PatternSupportsIndexOf ? 
- FindFirstCharMode.IndexOf : - FindFirstCharMode.BoyerMoore; - } - else if (code.LeadingCharClasses is not null) - { - (string charClass, bool caseInsensitive) = code.LeadingCharClasses[0]; - bool isSet = !RegexCharClass.IsSingleton(charClass); - _findFirstCharMode = (code.RightToLeft, caseInsensitive, isSet) switch - { - (false, false, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton, - (false, false, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set, - (false, true, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton, - (false, true, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set, - (true, false, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton, - (true, false, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set, - (true, true, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton, - (true, true, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set, - }; - } - else - { - _findFirstCharMode = FindFirstCharMode.NoSearch; - } } protected override void InitTrackCount() => runtrackcount = _code.TrackCount; @@ -372,306 +329,8 @@ private bool MatchRef(int index, int length) private void Backwardnext() => runtextpos += _rightToLeft ? 
1 : -1; - private enum FindFirstCharMode - { - LeadingAnchor_LeftToRight_Beginning, - LeadingAnchor_LeftToRight_Start, - LeadingAnchor_LeftToRight_EndZ, - LeadingAnchor_LeftToRight_End, - - LeadingAnchor_RightToLeft_Beginning, - LeadingAnchor_RightToLeft_Start, - LeadingAnchor_RightToLeft_EndZ, - LeadingAnchor_RightToLeft_End, - - IndexOf, - BoyerMoore, - - LeadingCharClass_LeftToRight_CaseSensitive_Singleton, - LeadingCharClass_LeftToRight_CaseSensitive_Set, - LeadingCharClass_LeftToRight_CaseInsensitive_Singleton, - LeadingCharClass_LeftToRight_CaseInsensitive_Set, - - LeadingCharClass_RightToLeft_CaseSensitive_Singleton, - LeadingCharClass_RightToLeft_CaseSensitive_Set, - LeadingCharClass_RightToLeft_CaseInsensitive_Singleton, - LeadingCharClass_RightToLeft_CaseInsensitive_Set, - - NoSearch, - } - - protected override bool FindFirstChar() - { - // Return early if we know there's not enough input left to match. - if (!_code.RightToLeft) - { - if (runtextpos > runtextend - _code.Tree.MinRequiredLength) - { - runtextpos = runtextend; - return false; - } - } - else - { - if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg) - { - runtextpos = runtextbeg; - return false; - } - } - - // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike - // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike - // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. - if (_code.LeadingAnchor == RegexPrefixAnalyzer.Bol && - !_code.RightToLeft) // don't bother customizing this optimization for the very niche RTL + Multiline case - { - // If we're not currently positioned at the beginning of a line (either - // the beginning of the string or just after a line feed), find the next - // newline and position just after it. 
- if (runtextpos > runtextbeg && runtext![runtextpos - 1] != '\n') - { - int newline = runtext.IndexOf('\n', runtextpos); - if (newline == -1 || newline + 1 > runtextend) - { - runtextpos = runtextend; - return false; - } - - runtextpos = newline + 1; - } - } - - switch (_findFirstCharMode) - { - // If the pattern is anchored, we can update our position appropriately and return immediately. - // If there's a Boyer-Moore prefix, we can also validate it. - - case FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning: - if (runtextpos > runtextbeg) - { - runtextpos = runtextend; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_Start: - if (runtextpos > runtextstart) - { - runtextpos = runtextend; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ: - if (runtextpos < runtextend - 1) - { - runtextpos = runtextend - 1; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_End: - if (runtextpos < runtextend) - { - runtextpos = runtextend; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning: - if (runtextpos > runtextbeg) - { - runtextpos = runtextbeg; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_Start: - if (runtextpos < runtextstart) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ: - if (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && runtext![runtextpos] != '\n')) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_End: - if (runtextpos < runtextend) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - // There was a prefix. Scan for it. 
- - case FindFirstCharMode.IndexOf: - { - int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(_code.BoyerMoorePrefix!.Pattern); - if (i >= 0) - { - runtextpos += i; - return true; - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.BoyerMoore: - runtextpos = _code.BoyerMoorePrefix!.Scan(runtext!, runtextpos, runtextbeg, runtextend); - if (runtextpos >= 0) - { - return true; - } - runtextpos = _code.RightToLeft ? runtextbeg : runtextend; - return false; - - // There's a leading character class. Search for it. - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton: - { - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - int i = span.IndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass)); - if (i >= 0) - { - runtextpos += i; - return true; - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - for (int i = 0; i < span.Length; i++) - { - if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton: - { - char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass); - TextInfo ti = _textInfo; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - for (int i = 0; i < span.Length; i++) - { - if (ch == ti.ToLower(span[i])) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - 
TextInfo ti = _textInfo; - for (int i = 0; i < span.Length; i++) - { - if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton: - { - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - int i = span.LastIndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass)); - if (i >= 0) - { - runtextpos = runtextbeg + i + 1; - return true; - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - for (int i = span.Length - 1; i >= 0; i--) - { - if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos = runtextbeg + i + 1; - return true; - } - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton: - { - char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass); - TextInfo ti = _textInfo; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - for (int i = span.Length - 1; i >= 0; i--) - { - if (ch == ti.ToLower(span[i])) - { - runtextpos = runtextbeg + i + 1; - return true; - } - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - TextInfo ti = _textInfo; - for (int i = span.Length - 1; i >= 0; i--) - { - if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos = runtextbeg + i + 1; - return true; - } - 
} - runtextpos = runtextbeg; - return false; - } - - // Nothing special to look for. Just return true indicating this is a valid position to try to match. - - default: - Debug.Assert(_findFirstCharMode == FindFirstCharMode.NoSearch); - return true; - } - - bool NoPrefixOrPrefixMatches() => - _code.BoyerMoorePrefix is not RegexBoyerMoore rbm || - rbm.IsMatch(runtext!, runtextpos, runtextbeg, runtextend); - } + protected override bool FindFirstChar() => + _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); protected override void Go() { @@ -1230,7 +889,7 @@ protected override void Go() int operand0 = Operand(0); string set = _code.Strings[operand0]; - ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0]; + ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0]; while (c-- > 0) { @@ -1322,7 +981,7 @@ protected override void Go() int len = Math.Min(Operand(1), Forwardchars()); int operand0 = Operand(0); string set = _code.Strings[operand0]; - ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0]; + ref uint[]? 
setLookup = ref _code.StringsAsciiLookup[operand0]; int i; for (i = len; i > 0; i--) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 8ed30bbcb266b..53b78c5d32479 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -39,9 +39,6 @@ public RegexRunnerFactory FactoryInstanceFromCode(string pattern, RegexCode code _code = code; _codes = code.Codes; _strings = code.Strings; - _leadingCharClasses = code.LeadingCharClasses; - _boyerMoorePrefix = code.BoyerMoorePrefix; - _leadingAnchor = code.LeadingAnchor; _trackcount = code.TrackCount; _options = options; _hasTimeout = hasTimeout; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index c23bb50720a05..e7b0e71076df1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -41,6 +41,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; using System.Threading; namespace System.Text.RegularExpressions @@ -151,11 +152,49 @@ public RegexNode(int type, RegexOptions options, int m, int n) N = n; } - public bool UseOptionR() => (Options & RegexOptions.RightToLeft) != 0; + /// Creates a RegexNode representing a single character. + /// The character. + /// The node's options. + /// The culture to use to perform any required transformations. + /// The created RegexNode. This might be a RegexNode.One or a RegexNode.Set. 
+ public static RegexNode CreateOneWithCaseConversion(char ch, RegexOptions options, CultureInfo? culture) + { + // If the options specify case-insensitivity, we try to create a node that fully encapsulates that. + if ((options & RegexOptions.IgnoreCase) != 0) + { + Debug.Assert(culture is not null); + + // If the character is part of a Unicode category that doesn't participate in case conversion, + // we can simply strip out the IgnoreCase option and make the node case-sensitive. + if (!RegexCharClass.ParticipatesInCaseConversion(ch)) + { + return new RegexNode(One, options & ~RegexOptions.IgnoreCase, ch); + } + + // Create a set for the character, trying to include all case-insensitive equivalent characters. + // If it's successful in doing so, resultIsCaseInsensitive will be false and we can strip + // out RegexOptions.IgnoreCase as part of creating the set. + string stringSet = RegexCharClass.OneToStringClass(ch, culture, out bool resultIsCaseInsensitive); + if (!resultIsCaseInsensitive) + { + return new RegexNode(Set, options & ~RegexOptions.IgnoreCase, stringSet); + } + + // Otherwise, until we can get rid of ToLower usage at match time entirely (https://github.com/dotnet/runtime/issues/61048), + // lowercase the character and proceed to create an IgnoreCase One node. + ch = culture.TextInfo.ToLower(ch); + } + + // Create a One node for the character. + return new RegexNode(One, options, ch); + } - public RegexNode ReverseLeft() + /// Reverses all children of a concatenation when in RightToLeft mode. 
+ public RegexNode ReverseConcatenationIfRightToLeft() { - if (UseOptionR() && Type == Concatenate && ChildCount() > 1) + if ((Options & RegexOptions.RightToLeft) != 0 && + Type == Concatenate && + ChildCount() > 1) { ((List)Children!).Reverse(); } @@ -203,13 +242,26 @@ private void ValidateFinalTreeInvariants() { RegexNode node = toExamine.Pop(); + // Add all children to be examined + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + RegexNode child = node.Child(i); + Debug.Assert(child.Next == node, $"{child.Description()} missing reference to parent {node.Description()}"); + + toExamine.Push(child); + } + // Validate that we never see certain node types. Debug.Assert(Type != Group, "All Group nodes should have been removed."); - // Validate expected child counts. - int childCount = node.ChildCount(); + // Validate node types and expected child counts. switch (node.Type) { + case Group: + Debug.Fail("All Group nodes should have been removed."); + break; + case Beginning: case Bol: case Boundary: @@ -247,25 +299,20 @@ private void ValidateFinalTreeInvariants() case Prevent: case Require: Debug.Assert(childCount == 1, $"Expected one and only one child for {node.TypeName}, got {childCount}."); - toExamine.Push(node.Child(0)); break; case Testref: case Testgroup: Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}."); - for (int i = 0; i < childCount; i++) - { - toExamine.Push(node.Child(i)); - } break; case Concatenate: case Alternate: Debug.Assert(childCount >= 2, $"Expected at least two children for {node.TypeName}, got {childCount}."); - for (int i = 0; i < childCount; i++) - { - toExamine.Push(node.Child(i)); - } + break; + + default: + Debug.Fail($"Unexpected node type: {node.Type}"); break; } @@ -273,6 +320,10 @@ private void ValidateFinalTreeInvariants() switch (node.Type) { case Multi: + Debug.Assert(node.Str is not null, "Expect non-null multi string"); + 
Debug.Assert(node.Str.Length >= 2, $"Expected {node.Str} to be at least two characters"); + break; + case Set: case Setloop: case Setloopatomic: @@ -881,8 +932,10 @@ private RegexNode ReduceAlternation() default: ReduceSingleLetterAndNestedAlternations(); - RegexNode newThis = ReplaceNodeIfUnnecessary(Nothing); - return newThis != this ? newThis : ExtractCommonPrefixes(); + RegexNode node = ReplaceNodeIfUnnecessary(Nothing); + node = ExtractCommonPrefixText(node); + node = ExtractCommonPrefixOneNotoneSet(node); + return node; } // This function performs two optimizations: @@ -952,7 +1005,6 @@ void ReduceSingleLetterAndNestedAlternations() break; } - // The last node was a Set or a One, we're a Set or One and our options are the same. // Merge the two nodes. j--; @@ -981,6 +1033,12 @@ void ReduceSingleLetterAndNestedAlternations() prev.Type = Set; prev.Str = prevCharClass.ToStringClass(Options); + if ((prev.Options & RegexOptions.IgnoreCase) != 0 && + RegexCharClass.MakeCaseSensitiveIfPossible(prev.Str, RegexParser.GetTargetCulture(prev.Options)) is string newSetString) + { + prev.Str = newSetString; + prev.Options &= ~RegexOptions.IgnoreCase; + } } else if (at.Type == Nothing) { @@ -1001,6 +1059,106 @@ void ReduceSingleLetterAndNestedAlternations() } } + // This function optimizes out prefix nodes from alternation branches that are + // the same across multiple contiguous branches. + // e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90) + static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation) + { + if (alternation.Type != Alternate) + { + return alternation; + } + + Debug.Assert(alternation.Children is List { Count: >= 2 }); + var children = (List)alternation.Children; + + // Only process left-to-right prefixes. 
+ if ((alternation.Options & RegexOptions.RightToLeft) != 0) + { + return alternation; + } + + // Only handle the case where each branch is a concatenation + foreach (RegexNode child in children) + { + if (child.Type != Concatenate || child.ChildCount() < 2) + { + return alternation; + } + } + + for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++) + { + Debug.Assert(children[startingIndex].Children is List { Count: >= 2 }); + + // Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop). + // Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing + // it for non-atomic variable length loops could change behavior as each branch could otherwise have a + // different number of characters consumed by the loop based on what's after it. + RegexNode required = children[startingIndex].Child(0); + switch (required.Type) + { + case One or Notone or Set: + case Oneloopatomic or Notoneloopatomic or Setloopatomic: + case Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy when required.M == required.N: + break; + + default: + continue; + } + + // Only handle the case where each branch begins with the exact same node value + int endingIndex = startingIndex + 1; + for (; endingIndex < children.Count; endingIndex++) + { + RegexNode other = children[endingIndex].Child(0); + if (required.Type != other.Type || + required.Options != other.Options || + required.M != other.M || + required.N != other.N || + required.Ch != other.Ch || + required.Str != other.Str) + { + break; + } + } + + if (endingIndex - startingIndex <= 1) + { + // Nothing to extract from this starting index. 
+ continue; + } + + // Remove the prefix node from every branch, adding it to a new alternation + var newAlternate = new RegexNode(Alternate, alternation.Options); + for (int i = startingIndex; i < endingIndex; i++) + { + ((List)children[i].Children!).RemoveAt(0); + newAlternate.AddChild(children[i]); + } + + // If this alternation is wrapped as atomic, we need to do the same for the new alternation. + if (alternation.Next is RegexNode parent && parent.Type == Atomic) + { + var atomic = new RegexNode(Atomic, alternation.Options); + atomic.AddChild(newAlternate); + newAlternate = atomic; + } + + // Now create a concatenation of the prefix node with the new alternation for the combined + // branches, and replace all of the branches in this alternation with that new concatenation. + var newConcat = new RegexNode(Concatenate, alternation.Options); + newConcat.AddChild(required); + newConcat.AddChild(newAlternate); + alternation.ReplaceChild(startingIndex, newConcat); + children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1); + } + + // If we've reduced this alternation to just a single branch, return it. + // Otherwise, return the alternation. + return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation; + } + // Analyzes all the branches of the alternation for text that's identical at the beginning // of every branch. That text is then pulled out into its own one or multi node in a // concatenation with the alternation (whose branches are updated to remove that prefix). @@ -1010,22 +1168,25 @@ void ReduceSingleLetterAndNestedAlternations() // by sets that can be merged. Third, it reduces the amount of duplicated comparisons required // if we end up backtracking into subsequent branches. // e.g. 
abc|ade => a(?bc|de) - RegexNode ExtractCommonPrefixes() + static RegexNode ExtractCommonPrefixText(RegexNode alternation) { + if (alternation.Type != Alternate) + { + return alternation; + } + + Debug.Assert(alternation.Children is List { Count: >= 2 }); + var children = (List)alternation.Children; + // To keep things relatively simple, we currently only handle: // - Left to right (e.g. we don't process alternations in lookbehinds) // - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes. // - All branches having the same options. - // - Text, rather than also trying to combine identical sets that start each branch. - - Debug.Assert(Children is List); - var children = (List)Children; - Debug.Assert(children.Count >= 2); // Only extract left-to-right prefixes. - if ((Options & RegexOptions.RightToLeft) != 0) + if ((alternation.Options & RegexOptions.RightToLeft) != 0) { - return this; + return alternation; } Span scratchChar = stackalloc char[1]; @@ -1036,7 +1197,7 @@ RegexNode ExtractCommonPrefixes() RegexNode? 
startingNode = children[startingIndex].FindBranchOneOrMultiStart(); if (startingNode is null) { - return this; + return alternation; } RegexOptions startingNodeOptions = startingNode.Options; @@ -1159,7 +1320,7 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) } } - if (Next is RegexNode parent && parent.Type == Atomic) + if (alternation.Next is RegexNode parent && parent.Type == Atomic) { var atomic = new RegexNode(Atomic, startingNodeOptions); atomic.AddChild(newAlternate); @@ -1169,11 +1330,11 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) var newConcat = new RegexNode(Concatenate, startingNodeOptions); newConcat.AddChild(prefix); newConcat.AddChild(newAlternate); - ReplaceChild(startingIndex, newConcat); + alternation.ReplaceChild(startingIndex, newConcat); children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1); } - return ChildCount() == 1 ? Child(0) : this; + return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 1f4a05afa47c1..0bda8a2367ed6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -81,6 +81,10 @@ private RegexParser(string pattern, RegexOptions options, CultureInfo culture, S { } + /// Gets the culture to use based on the specified options. + internal static CultureInfo GetTargetCulture(RegexOptions options) => + (options & RegexOptions.CultureInvariant) != 0 ? 
CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture) { var parser = new RegexParser(pattern, options, culture, stackalloc int[OptionStackDefaultSize]); @@ -319,7 +323,12 @@ private RegexNode ScanRegex() goto ContinueOuterScan; case '[': - AddUnitSet(ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options)); + { + string setString = ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options); + _unit = UseOptionI() && RegexCharClass.MakeCaseSensitiveIfPossible(setString, _culture) is string newSetString ? + new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, newSetString) : + new RegexNode(RegexNode.Set, _options, setString); + } break; case '(': @@ -378,14 +387,9 @@ private RegexNode ScanRegex() break; case '.': - if (UseOptionS()) - { - AddUnitSet(RegexCharClass.AnyClass); - } - else - { - AddUnitNotone('\n'); - } + _unit = UseOptionS() ? + new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, RegexCharClass.AnyClass) : + new RegexNode(RegexNode.Notone, _options & ~RegexOptions.IgnoreCase, '\n'); break; case '{': @@ -734,21 +738,17 @@ node.M is not (0 or RegexReplacement.LeftPortion or RegexReplacement.RightPortio { // we aren't in a range, and now there is a subtraction. Usually this happens // only when a subtraction follows a range, like [a-z-[b]] + MoveRight(); + RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly); if (!scanOnly) { - MoveRight(1); - charClass!.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!); + charClass!.AddSubtraction(rcc!); if (CharsRight() > 0 && RightChar() != ']') { throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast); } } - else - { - MoveRight(1); - ScanCharClass(caseInsensitive, scanOnly); - } } else { @@ -1173,32 +1173,32 @@ private void ScanBlank() case 'w': MoveRight(); return scanOnly ? 
null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass); case 'W': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass); case 's': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass); case 'S': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass); case 'd': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass); case 'D': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? 
RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass); case 'p': case 'P': @@ -1220,6 +1220,22 @@ private void ScanBlank() default: return ScanBasicBackslash(scanOnly); } + + static RegexOptions RemoveIgnoreCaseIfNotEcma(RegexOptions options) + { + // This function is used for \w, \W, \d, \D, \s, and \S to remove IgnoreCase, + // since they already include the notion of casing in their definitions. + // However, for compatibility, if ECMAScript is specified, we avoid stripping + // out the IgnoreCase. We should revisit this as part of https://github.com/dotnet/runtime/issues/61048, + // as it seems wrong that specifying ECMAScript (which implies non-Unicode) would + // then still involve lowercasing potentially Unicode character inputs to match + // against these sets. + if ((options & RegexOptions.ECMAScript) == 0) + { + options &= ~RegexOptions.IgnoreCase; + } + return options; + } } /// Scans \-style backreferences and character escapes @@ -1354,12 +1370,9 @@ private void ScanBlank() Textto(backpos); ch = ScanCharEscape(); - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - return scanOnly ? null : new RegexNode(RegexNode.One, _options, ch); + return !scanOnly ? 
+ RegexNode.CreateOneWithCaseConversion(ch, _options, _culture) : + null; } /* @@ -1369,7 +1382,7 @@ private RegexNode ScanDollar() { if (CharsRight() == 0) { - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); } char ch = RightChar(); @@ -1469,7 +1482,7 @@ private RegexNode ScanDollar() { case '$': MoveRight(); - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); case '&': capnum = 0; @@ -1502,7 +1515,7 @@ private RegexNode ScanDollar() // unrecognized $: literalize Textto(backpos); - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); } /// Throws on unsupported capture references for NonBacktracking in replacement patterns. @@ -2149,50 +2162,26 @@ private bool IsTrueQuantifier() /// Add a string to the last concatenate. private void AddConcatenate(int pos, int cch, bool isReplacement) { - if (cch == 0) + switch (cch) { - return; - } + case 0: + return; - RegexNode node; - if (cch > 1) - { - string str = UseOptionI() && !isReplacement ? -#if REGEXGENERATOR - StringExtensions.Create -#else - string.Create -#endif - (cch, (_pattern, _culture, pos, cch), static (dest, state) => - { - // We do the ToLower character-by character for consistency with the rest of the implementation. - // With surrogate pairs, doing a ToLower on the entire string is more correct linguistically, but - // Regex doesn't support surrogates, and not doing this character-by-character then causes differences - // from matching where characters are lowercased individually. 
- ReadOnlySpan src = state._pattern.AsSpan(state.pos, state.cch); - TextInfo ti = state._culture.TextInfo; - for (int i = 0; i < dest.Length; i++) - { - dest[i] = ti.ToLower(src[i]); - } - }) : - _pattern.Substring(pos, cch); - - node = new RegexNode(RegexNode.Multi, _options, str); - } - else - { - char ch = _pattern[pos]; + case 1: + _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture)); + break; - if (UseOptionI() && !isReplacement) - { - ch = _culture.TextInfo.ToLower(ch); - } + case > 1 when !UseOptionI() || isReplacement: + _concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch))); + break; - node = new RegexNode(RegexNode.One, _options, ch); + default: + foreach (char c in _pattern.AsSpan(pos, cch)) + { + _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(c, _options, _culture)); + } + break; } - - _concatenation!.AddChild(node); } /// Push the parser state (in response to an open paren) @@ -2243,11 +2232,11 @@ private void AddAlternate() if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref) { - _group.AddChild(_concatenation!.ReverseLeft()); + _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); } else { - _alternation!.AddChild(_concatenation!.ReverseLeft()); + _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); } _concatenation = new RegexNode(RegexNode.Concatenate, _options); @@ -2273,29 +2262,7 @@ private void AddConcatenate(bool lazy, int min, int max) private RegexNode? 
Unit() => _unit; /// Sets the current unit to a single char node - private void AddUnitOne(char ch) - { - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - _unit = new RegexNode(RegexNode.One, _options, ch); - } - - /// Sets the current unit to a single inverse-char node - private void AddUnitNotone(char ch) - { - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - _unit = new RegexNode(RegexNode.Notone, _options, ch); - } - - /// Sets the current unit to a single set node - private void AddUnitSet(string cc) => _unit = new RegexNode(RegexNode.Set, _options, cc); + private void AddUnitOne(char ch) => _unit = RegexNode.CreateOneWithCaseConversion(ch, _options, _culture); /// Sets the current unit to a subtree private void AddUnitNode(RegexNode node) => _unit = node; @@ -2308,7 +2275,7 @@ private void AddGroup() { if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref) { - _group.AddChild(_concatenation!.ReverseLeft()); + _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); if (_group.Type == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3) { @@ -2317,7 +2284,7 @@ private void AddGroup() } else { - _alternation!.AddChild(_concatenation!.ReverseLeft()); + _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); _group.AddChild(_alternation); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 96a709b2338d4..22a2abba1946a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -5,6 +5,8 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; +using 
System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions { @@ -40,256 +42,518 @@ private RegexPrefixAnalyzer(Span intStack) _skipAllChildren = false; } - /// Computes the leading substring in . - /// It's quite trivial and gives up easily, in which case an empty string is returned. - public static (string Prefix, bool CaseInsensitive) ComputeLeadingSubstring(RegexTree tree) + /// Computes the leading substring in ; may be empty. + public static string FindCaseSensitivePrefix(RegexTree tree) { - RegexNode curNode = tree.Root; - RegexNode? concatNode = null; - int nextChild = 0; + var vsb = new ValueStringBuilder(stackalloc char[64]); + Process(tree.Root, ref vsb); + return vsb.ToString(); - while (true) + // Processes the node, adding any prefix text to the builder. + // Returns whether processing should continue with subsequent nodes. + static bool Process(RegexNode node, ref ValueStringBuilder vsb) { - switch (curNode.Type) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { + // If we're too deep on the stack, just give up finding any more prefix. + return false; + } + + // We don't bother to handle reversed input, so process at most one node + // when handling RightToLeft. 
+ bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + + switch (node.Type) + { + // Concatenation case RegexNode.Concatenate: - if (curNode.ChildCount() > 0) { - concatNode = curNode; - nextChild = 0; + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (!Process(node.Child(i), ref vsb)) + { + return false; + } + } + return !rtl; } - break; - case RegexNode.Atomic: - case RegexNode.Capture: - curNode = curNode.Child(0); - concatNode = null; - continue; + // Alternation: find a string that's a shared prefix of all branches + case RegexNode.Alternate: + { + int childCount = node.ChildCount(); - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - case RegexNode.Onelazy: + // Store the initial branch into the target builder + int initialLength = vsb.Length; + bool keepExploring = Process(node.Child(0), ref vsb); + int addedLength = vsb.Length - initialLength; - // In release, cutoff at a length to which we can still reasonably construct a string and Boyer-Moore search. - // In debug, use a smaller cutoff to exercise the cutoff path in tests - const int Cutoff = -#if DEBUG - 50; -#else - RegexBoyerMoore.MaxLimit; -#endif + // Then explore the rest of the branches, finding the length + // a prefix they all share in common with the initial branch. + if (addedLength != 0) + { + var alternateSb = new ValueStringBuilder(64); - if (curNode.M > 0 && curNode.M < Cutoff) - { - return (new string(curNode.Ch, curNode.M), (curNode.Options & RegexOptions.IgnoreCase) != 0); - } + // Process each branch. If we reach a point where we've proven there's + // no overlap, we can bail early. + for (int i = 1; i < childCount && addedLength != 0; i++) + { + alternateSb.Length = 0; + + // Process the branch. We want to keep exploring after this alternation, + // but we can't if either this branch doesn't allow for it or if the prefix + // supplied by this branch doesn't entirely match all the previous ones. 
+ keepExploring &= Process(node.Child(i), ref alternateSb); + keepExploring &= alternateSb.Length == addedLength; + + addedLength = Math.Min(addedLength, alternateSb.Length); + for (int j = 0; j < addedLength; j++) + { + if (vsb[initialLength + j] != alternateSb[j]) + { + addedLength = j; + keepExploring = false; + break; + } + } + } - return (string.Empty, false); + alternateSb.Dispose(); - case RegexNode.One: - return (curNode.Ch.ToString(), (curNode.Options & RegexOptions.IgnoreCase) != 0); + // Then cull back on what was added based on the other branches. + vsb.Length = initialLength + addedLength; + } - case RegexNode.Multi: - return (curNode.Str!, (curNode.Options & RegexOptions.IgnoreCase) != 0); + return !rtl && keepExploring; + } + // One character + case RegexNode.One when (node.Options & RegexOptions.IgnoreCase) == 0: + vsb.Append(node.Ch); + return !rtl; + + // Multiple characters + case RegexNode.Multi when (node.Options & RegexOptions.IgnoreCase) == 0: + vsb.Append(node.Str); + return !rtl; + + // Loop of one character + case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0: + const int SingleCharIterationLimit = 32; // arbitrary cut-off to avoid creating super long strings unnecessarily + int count = Math.Min(node.M, SingleCharIterationLimit); + vsb.Append(node.Ch, count); + return count == node.N && !rtl; + + // Loop of a node + case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0: + { + const int NodeIterationLimit = 4; // arbitrary cut-off to avoid creating super long strings unnecessarily + int limit = Math.Min(node.M, NodeIterationLimit); + for (int i = 0; i < limit; i++) + { + if (!Process(node.Child(0), ref vsb)) + { + return false; + } + } + return limit == node.N && !rtl; + } + + // Grouping nodes for which we only care about their single child + case RegexNode.Atomic: + case RegexNode.Capture: + return Process(node.Child(0), ref vsb); + + // Zero-width 
anchors and assertions case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.ECMABoundary: + case RegexNode.NonBoundary: + case RegexNode.NonECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: case RegexNode.Empty: + case RegexNode.UpdateBumpalong: case RegexNode.Require: case RegexNode.Prevent: - break; + return true; + // Give up for anything else default: - return (string.Empty, false); + return false; } - - if (concatNode == null || nextChild >= concatNode.ChildCount()) - { - return (string.Empty, false); - } - - curNode = concatNode.Child(nextChild++); } } - /// Computes a character class for the first character in . - /// true if a character class could be computed; otherwise, false. - public static (string CharClass, bool CaseInsensitive)[]? ComputeFirstCharClass(RegexTree tree) + /// Finds sets at fixed-offsets from the beginning of the pattern/ + /// The RegexNode tree. + /// The culture to use for any case conversions. + /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete. + /// The array of found sets, or null if there aren't any. + public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets( + RegexTree tree, CultureInfo culture, bool thorough) { - var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]); - RegexFC? fc = s.RegexFCFromRegexTree(tree); - s.Dispose(); + const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing + const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily - if (fc == null || fc._nullable) + // Find all fixed-distance sets. + var results = new List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)>(); + int distance = 0; + TryFindFixedSets(tree.Root, results, ref distance, culture, thorough); +#if DEBUG + foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) result in results) { - return null; + Debug.Assert(result.Distance <= tree.MinRequiredLength, $"Min: {tree.MinRequiredLength}, Distance: {result.Distance}, Tree: {tree}"); } +#endif - if (fc.CaseInsensitive) + // Remove any sets that match everything; they're not helpful. (This check exists primarily to weed + // out use of . in Singleline mode.) + bool hasAny = false; + for (int i = 0; i < results.Count; i++) { - fc.AddLowercase(((tree.Options & RegexOptions.CultureInvariant) != 0) ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); + if (results[i].Set == RegexCharClass.AnyClass) + { + hasAny = true; + break; + } } - - return new[] { (fc.GetFirstChars(), fc.CaseInsensitive) }; - } - - /// Computes character classes for the first characters in . - /// - /// For example, given "hello|world" and a of 3, this will compute the sets [hw], [eo], and [lr]. - /// As with some of the other computations, it's quite trivial and gives up easily; for example, we could in - /// theory handle nodes in a concatenation after an alternation, but we look only at the branches of the - /// alternation itself. As this computation is intended primarily to handle global alternations, it's currently - /// a reasonable tradeoff between simplicity, performance, and the fullness of potential optimizations. - /// - public static (string CharClass, bool CaseInsensitive)[]? ComputeMultipleCharClasses(RegexTree tree, int maxChars) - { - Debug.Assert(maxChars > 1); - - if ((tree.Options & RegexOptions.RightToLeft) != 0) + if (hasAny) { - // We don't bother for RightToLeft. It's rare and adds non-trivial complication. 
- return null; + results.RemoveAll(s => s.Set == RegexCharClass.AnyClass); } - // The known minimum required length will have already factored in knowledge about alternations. - // If the known min length is less than the maximum number of chars requested, we can - // cut this short. If it's zero, there's nothing to be found. If it's one, we won't do - // any better than ComputeFirstCharClass (and likely worse). Otherwise, don't bother looking for more - // the min of the min length and the max requested chars. - maxChars = Math.Min(tree.MinRequiredLength, maxChars); - if (maxChars <= 1) + // If we don't have any results, try harder to compute one for the starting character. + // This is a more involved computation that can find things the fixed-distance investigation + // doesn't. + if (results.Count == 0) { - return null; + (string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(tree, culture); + if (first is not null) + { + results.Add((null, first.Value.CharClass, 0, first.Value.CaseInsensitive)); + } + + if (results.Count == 0) + { + return null; + } } - // Find an alternation on the path to the first node. If we can't, bail. - RegexNode node = tree.Root; - while (node.Type != RegexNode.Alternate) + // For every entry, see if we can mark any that are case-insensitive as actually being case-sensitive + // based on not participating in case conversion. And then for ones that are case-sensitive, try to + // get the chars that make up the set, if there are few enough. + Span scratch = stackalloc char[5]; // max optimized by IndexOfAny today + for (int i = 0; i < results.Count; i++) { - switch (node.Type) + (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) result = results[i]; + if (!RegexCharClass.IsNegated(result.Set)) { - case RegexNode.Atomic: - case RegexNode.Capture: - case RegexNode.Concatenate: - node = node.Child(0); - break; + int count = RegexCharClass.GetSetChars(result.Set, scratch); + if (count != 0) + { + if (result.CaseInsensitive && !RegexCharClass.ParticipatesInCaseConversion(scratch.Slice(0, count))) + { + result.CaseInsensitive = false; + } - default: - return null; + if (!result.CaseInsensitive) + { + result.Chars = scratch.Slice(0, count).ToArray(); + } + + results[i] = result; + } } } - Debug.Assert(node.Type == RegexNode.Alternate); - // Create RegexCharClasses to store the built-up sets. We may end up returning fewer - // than this if we find we can't easily fill this number of sets with 100% confidence. - var classes = new RegexCharClass?[maxChars]; - bool caseInsensitive = false; + // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search + // for the fastest and that have the best chance of matching as few false positives as possible. + results.Sort((s1, s2) => + { + if (s1.CaseInsensitive != s2.CaseInsensitive) + { + // If their case-sensitivities don't match, whichever is case-sensitive comes first / is considered lower. + return s1.CaseInsensitive ? 1 : -1; + } + + if (s1.Chars is not null && s2.Chars is not null) + { + // Then of the ones that are the same length, prefer those with less frequent values. The frequency is + // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True + // frequencies will vary widely based on the actual data being searched, the language of the data, etc. 
+ int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); + if (c != 0) + { + return c; + } - int branches = node.ChildCount(); - Debug.Assert(branches >= 2); - for (int branchNum = 0; branchNum < branches; branchNum++) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static float SumFrequencies(char[] chars) + { + float sum = 0; + foreach (char c in chars) + { + // Lookup each character in the table. For values > 255, this will end up truncating + // and thus we'll get skew in the data. It's already a gross approximation, though, + // and it is primarily meant for disambiguation of ASCII letters. + sum += s_frequency[(byte)c]; + } + return sum; + } + } + else if (s1.Chars is not null) + { + // If s1 has chars and s2 doesn't, then s1 has fewer chars. + return -1; + } + else if (s2.Chars is not null) + { + // If s2 has chars and s1 doesn't, then s2 has fewer chars. + return 1; + } + + return s1.Distance.CompareTo(s2.Distance); + }); + + return results; + + // Starting from the specified root node, populates results with any characters at a fixed distance + // from the node's starting position. The function returns true if the entire contents of the node + // is at a fixed distance, in which case distance will have been updated to include the full length + // of the node. If it returns false, the node isn't entirely fixed, in which case subsequent nodes + // shouldn't be examined and distance should no longer be trusted. However, regardless of whether it + // returns true or false, it may have populated results, and all populated results are valid. + static bool TryFindFixedSets(RegexNode node, List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)> results, ref int distance, CultureInfo culture, bool thorough) { - RegexNode alternateBranch = node.Child(branchNum); - caseInsensitive |= (alternateBranch.Options & RegexOptions.IgnoreCase) != 0; + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return false; + } - switch (alternateBranch.Type) + if ((node.Options & RegexOptions.RightToLeft) != 0) { + return false; + } + + bool caseInsensitive = (node.Options & RegexOptions.IgnoreCase) != 0; + + switch (node.Type) + { + case RegexNode.One: + if (results.Count < MaxFixedResults) + { + string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive); + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + return true; + } + return false; + + case RegexNode.Onelazy or RegexNode.Oneloop or RegexNode.Oneloopatomic when node.M > 0: + { + string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive); + int minIterations = Math.Min(node.M, MaxLoopExpansion); + int i = 0; + for (; i < minIterations && results.Count < MaxFixedResults; i++) + { + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + } + return i == node.M && i == node.N; + } + case RegexNode.Multi: - maxChars = Math.Min(maxChars, alternateBranch.Str!.Length); - for (int i = 0; i < maxChars; i++) { - (classes[i] ??= new RegexCharClass()).AddChar(alternateBranch.Str[i]); + string s = node.Str!; + int i = 0; + for (; i < s.Length && results.Count < MaxFixedResults; i++) + { + string setString = RegexCharClass.OneToStringClass(s[i], caseInsensitive ? 
culture : null, out bool resultIsCaseInsensitive); + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + } + return i == s.Length; } - continue; + + case RegexNode.Set: + if (results.Count < MaxFixedResults) + { + results.Add((null, node.Str!, distance++, caseInsensitive)); + return true; + } + return false; + + case RegexNode.Setlazy or RegexNode.Setloop or RegexNode.Setloopatomic when node.M > 0: + { + int minIterations = Math.Min(node.M, MaxLoopExpansion); + int i = 0; + for (; i < minIterations && results.Count < MaxFixedResults; i++) + { + results.Add((null, node.Str!, distance++, caseInsensitive)); + } + return i == node.M && i == node.N; + } + + case RegexNode.Notone: + // We could create a set out of Notone, but it will be of little value in helping to improve + // the speed of finding the first place to match, as almost every character will match it. + distance++; + return true; + + case RegexNode.Notonelazy or RegexNode.Notoneloop or RegexNode.Notoneloopatomic when node.M == node.N: + distance += node.M; + return true; + + case RegexNode.Beginning: + case RegexNode.Bol: + case RegexNode.Boundary: + case RegexNode.ECMABoundary: + case RegexNode.Empty: + case RegexNode.End: + case RegexNode.EndZ: + case RegexNode.Eol: + case RegexNode.NonBoundary: + case RegexNode.NonECMABoundary: + case RegexNode.UpdateBumpalong: + case RegexNode.Start: + case RegexNode.Prevent: + case RegexNode.Require: + // Zero-width anchors and assertions. In theory for Prevent and Require we could also investigate + // them and use the learned knowledge to impact the generated sets, at least for lookaheads. + // For now, we don't bother. + return true; + + case RegexNode.Atomic: + case RegexNode.Group: + case RegexNode.Capture: + return TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough); + + case RegexNode.Lazyloop or RegexNode.Loop when node.M > 0: + // This effectively only iterates the loop once. 
If deemed valuable, + // it could be updated in the future to duplicate the found results + // (updated to incorporate distance from previous iterations) and + // summed distance for all node.M iterations. If node.M == node.N, + // this would then also allow continued evaluation of the rest of the + // expression after the loop. + TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough); + return false; case RegexNode.Concatenate: { - int classPos = 0; - int concatChildren = alternateBranch.ChildCount(); - for (int i = 0; i < concatChildren && classPos < classes.Length; i++) + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) { - RegexNode concatChild = alternateBranch.Child(i); - caseInsensitive |= (concatChild.Options & RegexOptions.IgnoreCase) != 0; + if (!TryFindFixedSets(node.Child(i), results, ref distance, culture, thorough)) + { + return false; + } + } + return true; + } - switch (concatChild.Type) + case RegexNode.Alternate when thorough: + { + int childCount = node.ChildCount(); + bool allSameSize = true; + int? sameDistance = null; + var combined = new Dictionary(); + + var localResults = new List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>(); + for (int i = 0; i < childCount; i++) + { + localResults.Clear(); + int localDistance = 0; + allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, culture, thorough); + + if (localResults.Count == 0) { - case RegexNode.One: - (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Ch); - break; - case RegexNode.Set: - if (!(classes[classPos++] ??= new RegexCharClass()).TryAddCharClass(RegexCharClass.Parse(concatChild.Str!))) - { - // If the classes can't be merged, give up. 
- return null; - } - break; - case RegexNode.Multi: - for (int c = 0; c < concatChild.Str!.Length && classPos < classes.Length; c++) + return false; + } + + if (allSameSize) + { + if (sameDistance is null) + { + sameDistance = localDistance; + } + else if (sameDistance.Value != localDistance) + { + allSameSize = false; + } + } + + foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) fixedSet in localResults) + { + if (combined.TryGetValue(fixedSet.Distance, out (RegexCharClass Set, bool CaseInsensitive, int Count) value)) + { + if (fixedSet.CaseInsensitive == value.CaseInsensitive && + value.Set.TryAddCharClass(RegexCharClass.Parse(fixedSet.Set))) { - (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Str[c]); + value.Count++; + combined[fixedSet.Distance] = value; } - break; + } + else + { + combined[fixedSet.Distance] = (RegexCharClass.Parse(fixedSet.Set), fixedSet.CaseInsensitive, 1); + } + } + } + + foreach (KeyValuePair pair in combined) + { + if (results.Count >= MaxFixedResults) + { + allSameSize = false; + break; + } - default: // nothing else supported - i = concatChildren; // stop looking at additional nodes - break; + if (pair.Value.Count == childCount) + { + results.Add((null, pair.Value.Set.ToStringClass(), pair.Key + distance, pair.Value.CaseInsensitive)); } } - maxChars = Math.Min(maxChars, classPos); + if (allSameSize) + { + Debug.Assert(sameDistance.HasValue); + distance += sameDistance.Value; + return true; + } + + return false; } - continue; default: - // Any other node type as a branch in the alternation and we give up. Note that we don't special-case One/Notone/Set - // because that would mean the whole branch was a single char, in which case this computation provides - // zero benefit over the ComputeFirstCharClass computation. - return null; + return false; } } + } - // We've now examined all of the alternate branches and were able to successfully process them. 
- // Determine how many we can actually return. - for (int i = 0; i < maxChars; i++) - { - if (classes[i] is null) - { - maxChars = i; - break; - } - } + // Computes a character class for the first character in tree. This uses a more robust algorithm + // than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. For example, + // fixed literals won't find the starting set for a*b, as the a isn't guaranteed and the b is at a + // variable position, but this will find [ab] as it's instead looking for anything that under any + // circumstance could possibly start a match. + public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexTree tree, CultureInfo culture) + { + var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]); + RegexFC? fc = s.RegexFCFromRegexTree(tree); + s.Dispose(); - // Make sure we got something. - if (maxChars == 0) + if (fc == null || fc._nullable) { return null; } - // Create and return the RegexPrefix objects. - var prefixes = new (string CharClass, bool CaseInsensitive)[maxChars]; - - CultureInfo? ci = null; - if (caseInsensitive) - { - ci = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - } - - for (int i = 0; i < prefixes.Length; i++) + if (fc.CaseInsensitive) { - if (caseInsensitive) - { - classes[i]!.AddLowercase(ci!); - } - prefixes[i] = (classes[i]!.ToStringClass(), caseInsensitive); + fc.AddLowercase(culture); } - return prefixes; + return (fc.GetFirstChars(), fc.CaseInsensitive); } /// Takes a RegexTree and computes the leading anchor that it encounters. @@ -619,6 +883,84 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex) throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture))); } } + + /// Percent occurrences in source text (100 * char count / total count). 
+ private static readonly float[] s_frequency = new float[] + { + 0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */, + 0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */, + 0.000f /* '\x10' */, 0.000f /* '\x11' */, 0.000f /* '\x12' */, 0.000f /* '\x13' */, 0.003f /* '\x14' */, 0.000f /* '\x15' */, 0.000f /* '\x16' */, 0.000f /* '\x17' */, + 0.000f /* '\x18' */, 0.004f /* '\x19' */, 0.000f /* '\x1A' */, 0.000f /* '\x1B' */, 0.006f /* '\x1C' */, 0.006f /* '\x1D' */, 0.000f /* '\x1E' */, 0.000f /* '\x1F' */, + 8.952f /* ' ' */, 0.065f /* ' !' */, 0.420f /* ' "' */, 0.010f /* ' #' */, 0.011f /* ' $' */, 0.005f /* ' %' */, 0.070f /* ' &' */, 0.050f /* ' '' */, + 3.911f /* ' (' */, 3.910f /* ' )' */, 0.356f /* ' *' */, 2.775f /* ' +' */, 1.411f /* ' ,' */, 0.173f /* ' -' */, 2.054f /* ' .' */, 0.677f /* ' /' */, + 1.199f /* ' 0' */, 0.870f /* ' 1' */, 0.729f /* ' 2' */, 0.491f /* ' 3' */, 0.335f /* ' 4' */, 0.269f /* ' 5' */, 0.435f /* ' 6' */, 0.240f /* ' 7' */, + 0.234f /* ' 8' */, 0.196f /* ' 9' */, 0.144f /* ' :' */, 0.983f /* ' ;' */, 0.357f /* ' <' */, 0.661f /* ' =' */, 0.371f /* ' >' */, 0.088f /* ' ?' 
*/, + 0.007f /* ' @' */, 0.763f /* ' A' */, 0.229f /* ' B' */, 0.551f /* ' C' */, 0.306f /* ' D' */, 0.449f /* ' E' */, 0.337f /* ' F' */, 0.162f /* ' G' */, + 0.131f /* ' H' */, 0.489f /* ' I' */, 0.031f /* ' J' */, 0.035f /* ' K' */, 0.301f /* ' L' */, 0.205f /* ' M' */, 0.253f /* ' N' */, 0.228f /* ' O' */, + 0.288f /* ' P' */, 0.034f /* ' Q' */, 0.380f /* ' R' */, 0.730f /* ' S' */, 0.675f /* ' T' */, 0.265f /* ' U' */, 0.309f /* ' V' */, 0.137f /* ' W' */, + 0.084f /* ' X' */, 0.023f /* ' Y' */, 0.023f /* ' Z' */, 0.591f /* ' [' */, 0.085f /* ' \' */, 0.590f /* ' ]' */, 0.013f /* ' ^' */, 0.797f /* ' _' */, + 0.001f /* ' `' */, 4.596f /* ' a' */, 1.296f /* ' b' */, 2.081f /* ' c' */, 2.005f /* ' d' */, 6.903f /* ' e' */, 1.494f /* ' f' */, 1.019f /* ' g' */, + 1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */, + 1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */, + 0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */, + 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */, + 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */, + 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */, + 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* '\x9F' */, + 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' 
*/, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */, + 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */, + 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */, + 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */, + 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */, + 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */, + 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */, + 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */, + 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */, + 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */, + 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */, + 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* '\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */, + }; + + // The above table was generated 
programmatically with the following. This can be augmented to incorporate additional data sources, + // though it is only intended to be a rough approximation for use when tie-breaking and we'd otherwise be picking randomly, so, it's something. + // The frequencies may be wildly inaccurate when used with data sources different in nature than the training set, in which case we shouldn't + // be much worse off than just picking randomly: + // + // using System.Runtime.InteropServices; + // + // var counts = new Dictionary<byte, long>(); + // + // (string, string)[] rootsAndExtensions = new[] + // { + // (@"d:\repos\runtime\src\", "*.cs"), // C# files in dotnet/runtime + // (@"d:\Top25GutenbergBooks", "*.txt"), // Top 25 most popular books on Project Gutenberg + // }; + // + // foreach ((string root, string ext) in rootsAndExtensions) + // foreach (string path in Directory.EnumerateFiles(root, ext, SearchOption.AllDirectories)) + // foreach (string line in File.ReadLines(path)) + // foreach (char c in line.AsSpan().Trim()) + // CollectionsMarshal.GetValueRefOrAddDefault(counts, (byte)c, out _)++; + // + // long total = counts.Sum(i => i.Value); + // + // Console.WriteLine("/// Percent occurrences in source text (100 * char count / total count)."); + // Console.WriteLine("private static readonly float[] s_frequency = new float[]"); + // Console.WriteLine("{"); + // int i = 0; + // for (int row = 0; row < 32; row++) + // { + // Console.Write(" "); + // for (int col = 0; col < 8; col++) + // { + // counts.TryGetValue((byte)i, out long charCount); + // float frequency = (float)(charCount / (double)total) * 100; + // Console.Write($" {frequency:N3}f /* '{(i >= 32 && i < 127 ? 
$" {(char)i}" : $"\\x{i:X2}")}' */,"); + // i++; + // } + // Console.WriteLine(); + // } + // Console.WriteLine("};"); } internal sealed class RegexFC diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 2154947cfaa8d..93420b2381381 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -4,6 +4,7 @@ using System.Collections; using System.Collections.Generic; using System.Globalization; +using System.Runtime.InteropServices; namespace System.Text.RegularExpressions { @@ -38,10 +39,10 @@ private RegexWriter(Span emittedSpan, Span intStackSpan) /// This is the only function that should be called from outside. /// It takes a RegexTree and creates a corresponding RegexCode. /// - public static RegexCode Write(RegexTree tree) + public static RegexCode Write(RegexTree tree, CultureInfo culture) { var writer = new RegexWriter(stackalloc int[EmittedSize], stackalloc int[IntStackSize]); - RegexCode code = writer.RegexCodeFromRegexTree(tree); + RegexCode code = writer.RegexCodeFromRegexTree(tree, culture); writer.Dispose(); #if DEBUG @@ -71,7 +72,7 @@ public void Dispose() /// It also computes various information about the tree, such as /// prefix data to help with optimizations. /// - public RegexCode RegexCodeFromRegexTree(RegexTree tree) + public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture) { // Construct sparse capnum mapping if some numbers are unused. 
int capsize; @@ -131,46 +132,6 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) Emit(RegexCode.Stop); int[] emitted = _emitted.AsSpan().ToArray(); - bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0; - bool compiled = (tree.Options & RegexOptions.Compiled) != 0; - - // Compute prefixes to help optimize FindFirstChar. - RegexBoyerMoore? boyerMoorePrefix = null; - (string CharClass, bool CaseInsensitive)[]? leadingCharClasses = null; - (string leadingSubstring, bool leadingSubstringCI) = RegexPrefixAnalyzer.ComputeLeadingSubstring(tree); - if (leadingSubstring.Length > 1 && // if it's <= 1, perf is better using leadingCharClasses - leadingSubstring.Length <= RegexBoyerMoore.MaxLimit) - { - // Compute a Boyer-Moore prefix if we find a single string of sufficient length that always begins the expression. - CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - boyerMoorePrefix = new RegexBoyerMoore(leadingSubstring, leadingSubstringCI, rtl, culture); - } - - // If we didn't find a single leading substring, or if we found one but we won't be able to use it for a Boyer-Moore - // search, try to compute the characters set that might begin the string. - if (boyerMoorePrefix is null || - (boyerMoorePrefix.NegativeUnicode != null && compiled)) // compilation won't use Boyer-Moore if it has a negative Unicode table - { - boyerMoorePrefix = null; - - // First we employ a less aggressive but more valuable computation to see if we can find sets for each of the first N - // characters in the string. If that's unsuccessful, we employ a more aggressive check to compute a set for just - // the first character in the string. 
- - if ((tree.Options & RegexOptions.Compiled) != 0) // currently not utilized by the interpreter - { - leadingCharClasses = RegexPrefixAnalyzer.ComputeMultipleCharClasses(tree, maxChars: 5); // limit of 5 is based on experimentation and can be tweaked as needed - } - - if (leadingCharClasses is null) - { - leadingCharClasses = RegexPrefixAnalyzer.ComputeFirstCharClass(tree); - } - } - - // Compute any anchors starting the expression. - int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); - // Convert the string table into an ordered string array. var strings = new string[_stringTable.Count]; foreach (KeyValuePair stringEntry in _stringTable) @@ -179,7 +140,7 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) } // Return all that in a RegexCode object. - return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl); + return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize); } /// @@ -233,16 +194,23 @@ private void Emit(int op, int opd1, int opd2) /// /// Returns an index in the string table for a string; - /// uses a hashtable to eliminate duplicates. + /// uses a dictionary to eliminate duplicates. 
/// private int StringCode(string str) { +#if REGEXGENERATOR if (!_stringTable.TryGetValue(str, out int i)) { i = _stringTable.Count; _stringTable.Add(str, i); } - +#else + ref int i = ref CollectionsMarshal.GetValueRefOrAddDefault(_stringTable, str, out bool exists); + if (!exists) + { + i = _stringTable.Count - 1; + } +#endif return i; } @@ -265,7 +233,7 @@ private int MapCapnum(int capnum) => private void EmitFragment(int nodetype, RegexNode node, int curIndex) { int bits = 0; - if (node.UseOptionR()) + if ((node.Options & RegexOptions.RightToLeft) != 0) { bits |= RegexCode.Rtl; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs index a5f79f27ceaac..5ffe1da8ce163 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs @@ -240,6 +240,7 @@ public BDD[] TopologicalSort() /// Serializer uses more compacted representations when fewer bits are needed, which is reflected in the first /// two numbers of the return value. MTBDD terminals are represented by negated numbers as -id. 
/// + [ExcludeFromCodeCoverage] public long[] Serialize() { if (IsEmpty) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index bf75d21a05fcf..1fec095c54b10 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.Diagnostics; -using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions.Symbolic @@ -406,7 +405,6 @@ private DfaMatchingState MakeNewState(DfaMatchingState state lock (this) { state.Id = _stateCache.Count; - int k = state.GetHashCode(); _stateCache.Add(state); Debug.Assert(_statearray is not null); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index 3bd803e47f338..ba522d513392c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -19,28 +19,6 @@ internal readonly struct SymbolicRegexInfo private SymbolicRegexInfo(uint i) => _info = i; - /// Optimized lookup array for most common combinations. 
- /// Most common cases will be 0 (no anchors and not nullable) and 1 (no anchors and nullable) - private static readonly SymbolicRegexInfo[] s_infos = CreateSymbolicRegexInfos(); - - private static SymbolicRegexInfo[] CreateSymbolicRegexInfos() - { - var infos = new SymbolicRegexInfo[128]; - for (uint i = 0; i < infos.Length; i++) - { - infos[i] = new SymbolicRegexInfo(i); - } - return infos; - } - - private static SymbolicRegexInfo Mk(uint i) - { - SymbolicRegexInfo[] infos = s_infos; - return i < infos.Length ? - infos[i] : - new SymbolicRegexInfo(i); - } - internal static SymbolicRegexInfo Mk(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false, bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true) @@ -87,7 +65,7 @@ internal static SymbolicRegexInfo Mk(bool isAlwaysNullable = false, bool canBeNu i |= IsLazyMask; } - return Mk(i); + return new SymbolicRegexInfo(i); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -121,7 +99,7 @@ public static SymbolicRegexInfo Or(SymbolicRegexInfo[] infos) } i = (i & ~IsLazyMask) | isLazy; - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) @@ -140,7 +118,7 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) i = (i & ~IsLazyMask) | isLazy; i = (i & ~(IsAlwaysNullableMask | CanBeNullableMask)) | isNullable; - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) @@ -164,7 +142,10 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound uint i = body_info._info; // The loop is nullable if either the body is nullable or if the lower boud is 0 - i |= lowerBound == 0 ? 
(IsAlwaysNullableMask | CanBeNullableMask) : 0; + if (lowerBound == 0) + { + i |= IsAlwaysNullableMask | CanBeNullableMask; + } // The loop is lazy iff it is marked lazy if (isLazy) @@ -176,7 +157,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound i &= ~IsLazyMask; } - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo Not(SymbolicRegexInfo info) => diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index a8cec12036225..21c023345c731 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -11,15 +11,8 @@ namespace System.Text.RegularExpressions.Symbolic { /// Represents a regex matching engine that performs regex matching using symbolic derivatives. - internal abstract class SymbolicRegexMatcher + internal interface ISymbolicRegexMatcher { - /// Returns the next match index and length in the input string. - /// Whether to return once we know there's a match without determining where exactly it matched. - /// The input string. - /// The start position in the input. - /// The end position in the input. - public abstract SymbolicMatch FindMatch(bool isMatch, string input, int startat, int endat); - #if DEBUG /// Unwind the regex of the matcher and save the resulting state graph in DGML /// roughly the maximum number of states, 0 means no bound @@ -30,8 +23,7 @@ internal abstract class SymbolicRegexMatcher /// dgml output is written here /// maximum length of labels in nodes anything over that length is indicated with .. 
/// if true creates NFA instead of DFA - public abstract void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA); - + void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA); /// /// Generates up to k random strings matched by the regex @@ -40,13 +32,13 @@ internal abstract class SymbolicRegexMatcher /// random seed for the generator, 0 means no random seed /// if true then generate inputs that do not match /// - public abstract IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative); + IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative); #endif } /// Represents a regex matching engine that performs regex matching using symbolic derivatives. /// Character set type. - internal sealed class SymbolicRegexMatcher : SymbolicRegexMatcher where TSetType : notnull + internal sealed class SymbolicRegexMatcher : ISymbolicRegexMatcher where TSetType : notnull { /// Maximum number of states before switching over to Antimirov mode. /// @@ -127,54 +119,24 @@ internal sealed class SymbolicRegexMatcher : SymbolicRegexMatcher wher /// Timeout in milliseconds. This is only used if is true. private readonly int _timeout; - /// Classifier used to say whether a particular character can start a match for . - internal readonly BooleanClassifier _startSetClassifier; - - /// Predicate over characters that make some progress - private readonly TSetType _startSet; - - /// Maximum allowed size of . - private const int StartSetArrayMaxSize = 5; - - /// String of at most many characters - private readonly char[] _startSetArray; - - /// Number of elements in - private readonly int _startSetSize; - - /// If nonempty then has that fixed prefix - private readonly string _prefix; + /// Data and routines for skipping ahead to the next place a match could potentially start. 
+ private readonly RegexFindOptimizations? _findOpts; - /// Non-null when is nonempty - private readonly RegexBoyerMoore? _prefixBoyerMoore; + /// The initial states for the original pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _initialStates; - /// If true then the fixed prefix of is idependent of case - private readonly bool _isPrefixCaseInsensitive; + /// The initial states for the dot-star pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _dotstarredInitialStates; - /// Cached skip states from the initial state of for the 5 possible previous character kinds. - private readonly DfaMatchingState?[] _prefixSkipStates = new DfaMatchingState[CharKind.CharKindCount]; - /// Cached skip states from the initial state of Ar for the 5 possible previous character kinds. - private readonly DfaMatchingState?[] _reversePrefixSkipStates = new DfaMatchingState[CharKind.CharKindCount]; + /// The initial states for the reverse pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _reverseInitialStates; - private readonly string _reversePrefix; - - private readonly DfaMatchingState[] _initialStates = new DfaMatchingState[CharKind.CharKindCount]; - private readonly DfaMatchingState[] _dotstarredInitialStates = new DfaMatchingState[CharKind.CharKindCount]; - private readonly DfaMatchingState[] _reverseInitialStates = new DfaMatchingState[CharKind.CharKindCount]; - - private readonly uint[] _asciiCharKinds = new uint[128]; - - internal readonly CultureInfo _culture; - - private DfaMatchingState GetSkipState(uint prevCharKind) => - Volatile.Read(ref _prefixSkipStates[prevCharKind]) ?? 
- Interlocked.CompareExchange(ref _prefixSkipStates[prevCharKind], DeltaPlus(_prefix, _dotstarredInitialStates[prevCharKind]), null) ?? - _prefixSkipStates[prevCharKind]!; - - private DfaMatchingState GetReverseSkipState(uint prevCharKind) => - Volatile.Read(ref _reversePrefixSkipStates[prevCharKind]) ?? - Interlocked.CompareExchange(ref _reversePrefixSkipStates[prevCharKind], DeltaPlus(_reversePrefix, _reverseInitialStates[prevCharKind]), null) ?? - _reversePrefixSkipStates[prevCharKind]!; + /// Lookup table to quickly determine the character kind for ASCII characters. + /// Non-null iff the pattern contains anchors; otherwise, it's unused. + private readonly uint[]? _asciiCharKinds; /// Get the minterm of . /// character code @@ -186,16 +148,14 @@ private TSetType GetMinterm(int c) } /// Constructs matcher for given symbolic regex. - internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture) + internal SymbolicRegexMatcher(SymbolicRegexNode sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture) { + Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}"); + _pattern = sr; _builder = sr._builder; - _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout; _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms - _culture = culture; - - Debug.Assert(_builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {_builder._solver}"); _partitions = _builder._solver switch { BV64Algebra bv64 => bv64._classifier, @@ -203,44 +163,57 @@ internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, _ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms), }; - _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern); - _reversePattern = _pattern.Reverse(); - ConfigureRegexes(); - - 
_startSet = _pattern.GetStartSet(); - if (!_builder._solver.IsSatisfiable(_startSet) || _pattern.CanBeNullable) + if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch && + code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match. { - // If the startset is empty make it full instead by including all characters - // this is to ensure that startset is nonempty -- as an invariant assumed by operations using it - // - // Also, if A can be nullable then effectively disable use of startset by making it true - // because it may force search of next character in startset and fail to recognize an empty match - // because (by definition) an empty match has no start character. - // - // For example (this is also a unit test): - // for pattern "\B\W*?" or "\B\W*" or "\B\W?" and input "e.g:abc" there is an empty match in position 5 - // but startset \W will force search beyond position 5 and fails to find that match - _startSet = _builder._solver.True; + _findOpts = code.FindOptimizations; } - _startSetSize = (int)_builder._solver.ComputeDomainSize(_startSet); + // Determine the number of initial states. If there's no anchor, only the default previous + // character kind 0 is ever going to be used for all initial states. + int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1; - BDD startbdd = _builder._solver.ConvertToCharSet(css, _startSet); - _startSetClassifier = new BooleanClassifier(css, startbdd); - - //store the start characters in the A_startset_array if there are not too many characters - _startSetArray = _startSetSize <= StartSetArrayMaxSize ? - new List(css.GenerateAllCharacters(startbdd)).ToArray() : - Array.Empty(); + // Create the initial states for the original pattern. 
+ var initialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < initialStates.Length; i++) + { + initialStates[i] = _builder.MkState(_pattern, i); + } + _initialStates = initialStates; - _prefix = _pattern.GetFixedPrefix(css, culture.Name, out _isPrefixCaseInsensitive); - _reversePrefix = _reversePattern.GetFixedPrefix(css, culture.Name, out _); + // Create the dot-star pattern (a concatenation of any* with the original pattern) + // and all of its initial states. + _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern); + var dotstarredInitialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < dotstarredInitialStates.Length; i++) + { + // Used to detect if initial state was reentered, + // but observe that the behavior from the state may ultimately depend on the previous + // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, + // in that sense there can be several "versions" (not more than StateCount) of the initial state. + DfaMatchingState state = _builder.MkState(_dotStarredPattern, i); + state.IsInitialState = true; + dotstarredInitialStates[i] = state; + } + _dotstarredInitialStates = dotstarredInitialStates; - _prefixBoyerMoore = InitializePrefixBoyerMoore(); + // Create the reverse pattern (the original pattern in reverse order) and all of its + // initial states. + _reversePattern = _pattern.Reverse(); + var reverseInitialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < reverseInitialStates.Length; i++) + { + reverseInitialStates[i] = _builder.MkState(_reversePattern, i); + } + _reverseInitialStates = reverseInitialStates; + // Initialize our fast-lookup for determining the character kind of ASCII characters. + // This is only required when the pattern contains anchors, as otherwise there's only + // ever a single kind used. 
if (_pattern._info.ContainsSomeAnchor) { - for (int i = 0; i < 128; i++) + var asciiCharKinds = new uint[128]; + for (int i = 0; i < asciiCharKinds.Length; i++) { TSetType predicate2; uint charKind; @@ -256,68 +229,12 @@ internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, charKind = CharKind.WordLetter; } - _asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind; + asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind; } + _asciiCharKinds = asciiCharKinds; } } - private RegexBoyerMoore? InitializePrefixBoyerMoore() - { - if (_prefix != string.Empty && _prefix.Length <= RegexBoyerMoore.MaxLimit && _prefix.Length > 1) - { - // RegexBoyerMoore expects the prefix to be lower case when case is ignored. - // Use the culture of the matcher. - string prefix = _isPrefixCaseInsensitive ? _prefix.ToLower(_culture) : _prefix; - return new RegexBoyerMoore(prefix, _isPrefixCaseInsensitive, rightToLeft: false, _culture); - } - - return null; - } - - private void ConfigureRegexes() - { - void Configure(uint i) - { - _initialStates[i] = _builder.MkState(_pattern, i); - - // Used to detect if initial state was reentered, then startset can be triggered - // but observe that the behavior from the state may ultimately depend on the previous - // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, - // in that sense there can be several "versions" (not more than StateCount) of the initial state. - _dotstarredInitialStates[i] = _builder.MkState(_dotStarredPattern, i); - _dotstarredInitialStates[i].IsInitialState = true; - - _reverseInitialStates[i] = _builder.MkState(_reversePattern, i); - } - - // Create initial states for A, A1 and Ar. - if (!_pattern._info.ContainsSomeAnchor) - { - // Only the default previous character kind 0 is ever going to be used for all initial states. 
- // _A1q0[0] is recognized as special initial state. - // This information is used for search optimization based on start set and prefix of A. - Configure(0); - } - else - { - for (uint i = 0; i < CharKind.CharKindCount; i++) - { - Configure(i); - } - } - } - - /// Return the state after the given string from the given state . - private DfaMatchingState DeltaPlus(string pattern, DfaMatchingState state) where TTransition : struct, ITransition - { - for (int i = 0; i < pattern.Length; i++) - { - state = Delta(pattern, i, state); - } - - return state; - } - /// Interface for transitions used by the method. private interface ITransition { @@ -341,7 +258,7 @@ private DfaMatchingState Delta(string input, int i, DfaMa minterms.Length : // mintermId = minterms.Length represents \Z (last \n) _partitions.GetMintermID(c); - TSetType minterm = (uint)mintermId < minterms.Length ? + TSetType minterm = (uint)mintermId < (uint)minterms.Length ? minterms[mintermId] : _builder._solver.False; // minterm=False represents \Z @@ -428,30 +345,21 @@ private DfaMatchingState CreateNewTransition(DfaMatchingState timeoutOccursAt && 0 < currentMillis) - return; - - //regex pattern is in general not available in srm and - //the input is not available here but could be passed as argument to DoCheckTimeout - throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout)); + if (currentMillis >= timeoutOccursAt && (0 <= timeoutOccursAt || 0 >= currentMillis)) + { + throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout)); + } } /// Find a match. /// Whether to return once we know there's a match without determining where exactly it matched. 
- /// input string - /// the position to start search in the input string - /// the next position after the end position in the input - public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, int k) + /// The input string + /// The position to start search in the input string. + /// The non-inclusive position to end the search in the input string. + public SymbolicMatch FindMatch(bool isMatch, string input, int startat, int end) { int timeoutOccursAt = 0; if (_checkTimeout) @@ -460,18 +368,16 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5); } - if (startat == k) + if (startat == end) { - //covers the special case when the remaining input suffix - //where a match is sought is empty (for example when the input is empty) - //in this case the only possible match is an empty match + // Covers the special-case of an empty match at the end of the input. uint prevKind = GetCharKind(input, startat - 1); uint nextKind = GetCharKind(input, startat); bool emptyMatchExists = _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind)); - return - !emptyMatchExists ? SymbolicMatch.NoMatch : - new SymbolicMatch(startat, 0); + return emptyMatchExists ? + new SymbolicMatch(startat, 0) : + SymbolicMatch.NoMatch; } // Find the first accepting state. Initial start position in the input is i == 0. @@ -479,7 +385,7 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, // May return -1 as a legitimate value when the initial state is nullable and startat == 0. // Returns NoMatchExists when there is no match. 
- i = FindFinalStatePosition(input, k, i, timeoutOccursAt, out int i_q0_A1, out int watchdog); + i = FindFinalStatePosition(input, end, i, timeoutOccursAt, out int i_q0_A1, out int watchdog); if (i == NoMatchExists) { @@ -502,24 +408,17 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, } else { - if (i < startat) - { - Debug.Assert(i == startat - 1); - i_start = startat; - } - else - { - // Walk in reverse to locate the start position of the match - i_start = FindStartPosition(input, i, i_q0_A1); - } - - i_end = FindEndPosition(input, k, i_start); + Debug.Assert(i >= startat - 1); + i_start = i < startat ? + startat : + FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match + i_end = FindEndPosition(input, end, i_start); } return new SymbolicMatch(i_start, i_end + 1 - i_start); } - /// Find match end position using A, end position is known to exist. + /// Find match end position using the original pattern, end position is known to exist. /// input array /// inclusive start position /// exclusive end position @@ -561,7 +460,7 @@ private int FindEndPosition(string input, int exclusiveEnd, int i) return i_end; } - // Inner loop for FindEndPosition parameterized by an ITransition type. + /// Inner loop for FindEndPosition parameterized by an ITransition type. [MethodImpl(MethodImplOptions.AggressiveInlining)] private bool FindEndPositionDeltas(string input, ref int i, int j, ref DfaMatchingState q, ref int i_end) where TTransition : struct, ITransition { @@ -582,7 +481,7 @@ private bool FindEndPositionDeltas(string input, ref int i, int j, } else if (q.IsDeadend) { - // Nonaccepting sink state (deadend) has been reached in A. + // Non-accepting sink state (deadend) has been reached in the original pattern. // So the match ended when the last i_end was updated. 
return true; } @@ -594,26 +493,18 @@ private bool FindEndPositionDeltas(string input, ref int i, int j, return false; } - /// Walk back in reverse using Ar to find the start position of match, start position is known to exist. + /// Walk back in reverse using the reverse pattern to find the start position of match, start position is known to exist. /// the input string /// position to start walking back from, i points at the last character of the match /// do not pass this boundary when walking back /// private int FindStartPosition(string input, int i, int match_start_boundary) { - // Fetch the correct start state for Ar. + // Fetch the correct start state for the reverse pattern. // This depends on previous character --- which, because going backwards, is character number i+1. uint prevKind = GetCharKind(input, i + 1); DfaMatchingState q = _reverseInitialStates[prevKind]; - // Ar may have a fixed prefix sequence - if (_reversePrefix.Length > 0) - { - //skip past the prefix portion of Ar - q = GetReverseSkipState(prevKind); - i -= _reversePrefix.Length; - } - if (i == -1) { Debug.Assert(q.IsNullable(GetCharKind(input, i)), "we reached the beginning of the input, thus the state q must be accepting"); @@ -623,12 +514,12 @@ private int FindStartPosition(string input, int i, int match_start_boundary) int last_start = -1; if (q.IsNullable(GetCharKind(input, i))) { - // The whole prefix of Ar was in reverse a prefix of A, - // for example when the pattern of A is concrete word such as "abc" + // The whole prefix of the reverse pattern was in reverse a prefix of the original pattern, + // for example when the original pattern is concrete word such as "abc" last_start = i + 1; } - //walk back to the accepting state of Ar + // Walk back to the accepting state of the reverse pattern while (i >= match_start_boundary) { int j = Math.Max(match_start_boundary, i - AntimirovThresholdLeeway); @@ -663,7 +554,7 @@ private bool FindStartPositionDeltas(string input, ref int i, int j 
if (q.IsNullable(GetCharKind(input, i - 1))) { // Earliest start point so far. This must happen at some point - // or else A1 would not have reached a final state after match_start_boundary. + // or else the dot-star pattern would not have reached a final state after match_start_boundary. last_start = i; } @@ -683,7 +574,7 @@ private bool FindStartPositionDeltas(string input, ref int i, int j /// length of match when positive private int FindFinalStatePosition(string input, int k, int i, int timeoutOccursAt, out int initialStateIndex, out int watchdog) { - // Get the correct start state of A1, which in general depends on the previous character kind in the input. + // Get the correct start state of the dot-star pattern, which in general depends on the previous character kind in the input. uint prevCharKindId = GetCharKind(input, i - 1); DfaMatchingState q = _dotstarredInitialStates[prevCharKindId]; initialStateIndex = i; @@ -712,53 +603,13 @@ private int FindFinalStatePosition(string input, int k, int i, int timeoutOccurs { if (q.IsInitialState) { - // i_q0_A1 is the most recent position in the input when A1 is in the initial state + // i_q0_A1 is the most recent position in the input when the dot-star pattern is in the initial state initialStateIndex = i; - if (_prefixBoyerMoore != null) + if (_findOpts is RegexFindOptimizations findOpts) { - // Stay in the initial state if the prefix does not match. - // Thus advance the current position to the first position where the prefix does match. - i = _prefixBoyerMoore.Scan(input, i, 0, input.Length); - - if (i == -1) // Scan returns -1 when a matching position does not exist - { - watchdog = -1; - return -2; - } - - // Compute the end state for the A prefix. - // Skip directly to the resulting state - // --- i.e. 
do the loop --- - // for (int j = 0; j < prefix.Length; j++) - // q = Delta(prefix[j], q, out regex); - // --- - q = GetSkipState(q.PrevCharKind); - - // skip the prefix - i += _prefix.Length; - - // here i points at the next character (the character immediately following the prefix) - if (q.IsNullable(GetCharKind(input, i))) - { - // Return the last position of the match - watchdog = q.WatchDog; - return i - 1; - } - - if (i == k) - { - // no match was found - return -2; - } - } - else - { - // we are still in the initial state, when the prefix is empty - // find the first position i that matches with some character in the start set - i = IndexOfStartSet(input, i); - - if (i == -1) + // Find the first position i that matches with some likely character. + if (!findOpts.TryFindNextStartingPosition(input, ref i, 0, 0, k)) { // no match was found return NoMatchExists; @@ -833,68 +684,45 @@ private bool FindFinalStatePositionDeltas(string input, int j, ref [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetCharKind(string input, int i) { - if (!_pattern._info.ContainsSomeAnchor) - { - // The previous character kind is irrelevant when anchors are not used. - return CharKind.General; - } - - if (i == -1 || i == input.Length) - { - return CharKind.StartStop; - } + return !_pattern._info.ContainsSomeAnchor ? + CharKind.General : // The previous character kind is irrelevant when anchors are not used. + GetCharKindWithAnchor(input, i); - char nextChar = input[i]; - if (nextChar == '\n') + uint GetCharKindWithAnchor(string input, int i) { - return - _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n - i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). - CharKind.Newline; - } - - uint[] asciiCharKinds = _asciiCharKinds; - return - nextChar < asciiCharKinds.Length ? 
asciiCharKinds[nextChar] : - _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 0 : //apply the wordletter predicate to compute the kind of the next character - CharKind.WordLetter; - } + Debug.Assert(_asciiCharKinds is not null); - /// - /// Find first occurrence of startset element in input starting from index i. - /// Startset here is assumed to consist of a few characters. - /// - /// input string to search in - /// the start index in input to search from - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private int IndexOfStartSet(string input, int i) - { - if (_startSetSize <= StartSetArrayMaxSize) - { - return input.IndexOfAny(_startSetArray, i); - } + if ((uint)i >= (uint)input.Length) + { + return CharKind.StartStop; + } - for (int j = i; j < input.Length; j++) - { - if (_startSetClassifier.IsTrue(input[j])) + char nextChar = input[i]; + if (nextChar == '\n') { - return j; + return + _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n + i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). + CharKind.Newline; } - } - return -1; + uint[] asciiCharKinds = _asciiCharKinds; + return + nextChar < (uint)asciiCharKinds.Length ? asciiCharKinds[nextChar] : + _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 
0 : //apply the wordletter predicate to compute the kind of the next character + CharKind.WordLetter; + } } #if DEBUG - public override void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) + public void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) { var graph = new DGML.RegexAutomaton(this, bound, addDotStar, inReverse, asNFA); var dgml = new DGML.DgmlWriter(writer, hideStateInfo, maxLabelLength, onlyDFAinfo); dgml.Write(graph); } - public override IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => + public IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => new SymbolicRegexSampler(_pattern, randomseed, negative).GenerateRandomMembers(k); #endif } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 5ecadcad26ff6..f7992bf43950d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -1424,152 +1424,6 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1) }; } - /// - /// Gets the string prefix that the regex must match or the empty string if such a prefix does not exist. - /// Sets ignoreCase = true when the prefix works under case-insensitivity. 
- /// For example if the input prefix is "---" it sets ignoreCase=false, - /// if the prefix is "---[aA][bB]" it returns "---AB" and sets ignoreCase=true - /// - internal string GetFixedPrefix(CharSetSolver css, string culture, out bool ignoreCase) - { - ignoreCase = false; - StringBuilder prefix = new(); - bool doneWithoutIgnoreCase = false; - bool doneWithIgnoreCase = false; - foreach (S x in GetPrefixSequence()) - { - BDD bdd = _builder._solver.ConvertToCharSet(css, x); - char character = (char)bdd.GetMin(); - // Check if the prefix extends without ignore case: the set is a single character - if (!doneWithoutIgnoreCase && !css.IsSingleton(bdd)) - { - doneWithoutIgnoreCase = true; - } - if (!doneWithIgnoreCase) - { - // Check if the prefix extends with ignore case: ignoring case doesn't change the set - if (css.ApplyIgnoreCase(css.CharConstraint(character), culture).Equals(bdd)) - { - // Turn ignoreCase on when the prefix extends only under ignore case - if (doneWithoutIgnoreCase) - { - ignoreCase = true; - } - } - else - { - doneWithIgnoreCase = true; - } - } - // Append the character when the prefix extends in either of the ways - if (!doneWithoutIgnoreCase || !doneWithIgnoreCase) - prefix.Append(character); - else - break; - } - return prefix.ToString(); - } - - private IEnumerable GetPrefixSequence() - { - List> paths = new(); - HashSet> nextPaths = new(); - - paths.Add(this); - while (true) - { - bool done = false; - Debug.Assert(paths.Count > 0, "The generator should have ended when any path fails to extend."); - // Generate the next set from one path - S next; - if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out next)) - { - // A path didn't have a next set as supported by this algorithm - yield break; - } - if (!_builder._solver.IsSatisfiable(next)) - { - yield break; - } - while (paths.Count > 0) - { - // For all other paths check that they produce the same set - S newSet; - if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out newSet) 
|| !newSet.Equals(next)) - { - // Either a path didn't have a next set as supported by this algorithm, or the next set was not equal - yield break; - } - } - // At this point all paths generated equal next sets - yield return next; - if (done) - { - // Some path had no continuation, end the prefix - yield break; - } - else - { - Debug.Assert(paths.Count == 0, "Not all paths were considered for next set."); - paths.AddRange(nextPaths); - nextPaths.Clear(); - } - } - } - - private bool GetNextPrefixSet(ref List> paths, ref HashSet> nextPaths, ref bool done, out S set) - { - while (paths.Count > 0) - { - SymbolicRegexNode node = paths[paths.Count - 1]; - paths.RemoveAt(paths.Count - 1); - switch (node._kind) - { - case SymbolicRegexKind.Singleton: - Debug.Assert(node._set is not null); - set = node._set; - done = true; // No continuation, done after the next set - return true; - case SymbolicRegexKind.Concat: - Debug.Assert(node._left is not null && node._right is not null); - if (!node._left.CanBeNullable) - { - if (node._left.GetFixedLength() == 1) - { - set = node._left.GetStartSet(); - // Left side had just one character, can use just right side as path - nextPaths.Add(node._right); - return true; - } - else - { - // Left side may need multiple steps to get through. However, it is safe - // (though not complete) to forget the right side and just expand the path - // for the left side. 
- paths.Add(node._left); - break; - } - } - else - { - // Left side may be nullable, can't extend the prefix - set = _builder._solver.False; // Not going to be used - return false; - } - case SymbolicRegexKind.Or: - case SymbolicRegexKind.And: - Debug.Assert(node._alts is not null); - // Handle alternatives as separate paths - paths.AddRange(node._alts); - break; - default: - set = _builder._solver.False; // Not going to be used - return false; // Cut prefix immediately for unhandled node - } - } - set = _builder._solver.False; // Not going to be used - return false; - } /// Get the predicate that covers all elements that make some progress. internal S GetStartSet() => _startSet; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 83906ff029c1a..b0d9de74668a0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -12,10 +12,8 @@ internal sealed class SymbolicRegexRunnerFactory : RegexRunnerFactory /// The unicode component, including the BDD algebra. internal static readonly UnicodeCategoryTheory s_unicode = new UnicodeCategoryTheory(new CharSetSolver()); - /// The matching engine. - internal readonly SymbolicRegexMatcher _matcher; - /// Minimum length computed - private readonly int _minRequiredLength; + /// The matching engine, for 64 or fewer minterms. A SymbolicRegexMatcher of ulong or VB + internal readonly ISymbolicRegexMatcher _matcher; /// Initializes the factory. 
public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) @@ -32,23 +30,22 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan var solver = (CharSetSolver)s_unicode._solver; SymbolicRegexNode root = converter.Convert(code.Tree.Root, topLevel: true); - _minRequiredLength = code.Tree.MinRequiredLength; - BDD[] minterms = root.ComputeMinterms(); if (minterms.Length > 64) { // Use BV to represent a predicate var algBV = new BVAlgebra(solver, minterms); - var builderBV = new SymbolicRegexBuilder(algBV); - - // The default constructor sets the following predicates to False; this update happens after the fact. - // It depends on whether anchors where used in the regex whether the predicates are actually different from False. - builderBV._wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors); - builderBV._newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate); + var builderBV = new SymbolicRegexBuilder(algBV) + { + // The default constructor sets the following predicates to False; this update happens after the fact. + // It depends on whether anchors where used in the regex whether the predicates are actually different from False. 
+ _wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors), + _newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate) + }; - //Convert the BDD based AST to BV based AST + // Convert the BDD-based AST to BV-based AST SymbolicRegexNode rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(rootBV, solver, minterms, matchTimeout, culture); + _matcher = new SymbolicRegexMatcher(rootBV, code, solver, minterms, matchTimeout, culture); } else { @@ -64,37 +61,31 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan // Convert the BDD-based AST to ulong-based AST SymbolicRegexNode root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(root64, solver, minterms, matchTimeout, culture); + _matcher = new SymbolicRegexMatcher(root64, code, solver, minterms, matchTimeout, culture); } } /// Creates a object. - protected internal override RegexRunner CreateInstance() => new Runner(_matcher, _minRequiredLength); + protected internal override RegexRunner CreateInstance() => _matcher is SymbolicRegexMatcher srmUInt64 ? + new Runner(srmUInt64) : + new Runner((SymbolicRegexMatcher)_matcher); /// Runner type produced by this factory. /// - /// The wrapped is itself thread-safe and can be shared across + /// The wrapped is itself thread-safe and can be shared across /// all runner instances, but the runner itself has state (e.g. for captures, positions, etc.) /// and must not be shared between concurrent uses. /// - private sealed class Runner : RegexRunner + private sealed class Runner : RegexRunner where TSetType : notnull { /// The matching engine. - private readonly SymbolicRegexMatcher _matcher; - /// Minimum length computed. 
- private readonly int _minRequiredLength; + private readonly SymbolicRegexMatcher _matcher; - internal Runner(SymbolicRegexMatcher matcher, int minRequiredLength) - { - _matcher = matcher; - _minRequiredLength = minRequiredLength; - } + internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; protected override void InitTrackCount() { } // nop, no backtracking - protected override bool FindFirstChar() => - // The real logic is all in Go. Here we simply validate if there's enough text remaining to possibly match. - runtextpos <= runtextend - _minRequiredLength; + protected override bool FindFirstChar() => true; // The logic is all in Go. protected override void Go() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs index 3f965cec7cd93..8269e123a6f12 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs @@ -178,13 +178,11 @@ private IEnumerable> Step(List> states } private BDD ToBDD(S pred) => _solver.ConvertToCharSet(SymbolicRegexRunnerFactory.s_unicode._solver, pred); + private T Choose(IList elems) => elems[_random.Next(elems.Count)]; - private T Choose(IEnumerable elems) - { - List list = new List(elems); - return list[_random.Next(list.Count)]; - } + private char ChooseChar((uint, uint) pair) => (char)_random.Next((int)pair.Item1, (int)pair.Item2 + 1); + private char ChooseChar(BDD bdd) { Debug.Assert(!bdd.IsEmpty); @@ -192,8 +190,10 @@ private char ChooseChar(BDD bdd) BDD bdd1 = SymbolicRegexRunnerFactory.s_unicode._solver.And(bdd, _ascii); return ChooseChar(Choose(((CharSetSolver)SymbolicRegexRunnerFactory.s_unicode._solver).ToRanges(bdd1.IsEmpty ? 
bdd : bdd1))); } + private bool ChooseRandomlyTrueOrFalse() => _random.Next(100) < 50; /// Returns true if some state is unconditionally final + private bool IsFinal(IEnumerable> states) { foreach (SymbolicRegexNode state in states) @@ -205,6 +205,7 @@ private bool IsFinal(IEnumerable> states) } return false; } + /// Returns true if some state can be final private bool CanBeFinal(IEnumerable> states) { @@ -217,6 +218,7 @@ private bool CanBeFinal(IEnumerable> states) } return false; } + /// Returns true if some state is final in the given context private bool IsFinal(IEnumerable> states, uint context) { @@ -229,7 +231,9 @@ private bool IsFinal(IEnumerable> states, uint context) } return false; } + private bool IsWordchar(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._wordLetterPredicateForAnchors)); + private bool IsNewline(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._newLinePredicate)); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs index 73b7249b408df..5900e44a29738 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs @@ -1,11 +1,13 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Diagnostics.CodeAnalysis; using System.IO; namespace System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG + [ExcludeFromCodeCoverage] internal static class GeneratorHelper { public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs index 00098b703b963..24d4ae4541372 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs @@ -3,12 +3,14 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; namespace System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG + [ExcludeFromCodeCoverage] internal static class IgnoreCaseRelationGenerator { private const string DefaultCultureName = "en-US"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs index 32b09bf1d14ae..c3ced759b0222 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; @@ -10,6 +11,7 @@ namespace 
System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG /// Utility for generating unicode category ranges and corresponing binary decision diagrams. + [ExcludeFromCodeCoverage] internal static class UnicodeCategoryRangesGenerator { /// Generator for BDD Unicode category definitions. @@ -88,6 +90,7 @@ private static void WriteSerializedBDDs(StreamWriter sw) } /// Used internally for creating a collection of ranges for serialization. + [ExcludeFromCodeCoverage] internal sealed class Ranges { public readonly List ranges = new List(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs index 247c60ff46bd3..e15d49c78ff4c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -114,5 +114,21 @@ public static TResult CallOnEmptyStack(Func func(arg1, arg2, arg3)) .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function. + /// The fourth argument to pass to the function. 
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => func(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index 5b43fab690865..3204d7a989268 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -13,119 +13,121 @@ public class RegexGroupTests { public static IEnumerable Groups_Basic_TestData() { - // (A - B) B is a subset of A(ie B only contains chars that are in A) - yield return new object[] { null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + // (A - B) B is a subset of A(ie B only contains chars that are in A) + yield return new object[] { engine, null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\w-[b-y]]+", 
"bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; - yield return new object[] { null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; + yield return new object[] { engine, null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; + yield return new object[] { engine, null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; - yield return new object[] { null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return new object[] { null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } }; - yield return new object[] { null, 
@"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; - // (A - B) B is a superset of A (ie B contains chars that are in A plus other chars that are not in A) - yield return new object[] { null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + // (A - B) B is a superset of A (ie B contains chars that are in A plus other chars that are not in A) + yield return new object[] { engine, null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, 
@"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; - yield return new object[] { null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; + yield return new object[] { engine, null, @"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; + yield return new object[] { engine, null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; - yield return new object[] { null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return new object[] { null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, 
null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } }; - yield return new object[] { null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; - // (A - B) B only contains chars that are not in A - yield return new object[] { null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + // (A - B) B only contains chars that are not in A + yield return new object[] { engine, null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { 
engine, null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } }; + yield return new object[] { engine, null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } }; - yield return new object[] { null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; - yield return new object[] { null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; - yield return new object[] { null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } }; + yield return new object[] { engine, null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; + yield return new object[] { engine, null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; + yield return new object[] { engine, null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } }; - yield return new object[] { null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } }; - yield return new object[] { null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } }; + yield return new object[] { engine, null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return 
new object[] { null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; - yield return new object[] { null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; - yield return new object[] { null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; - yield return new object[] { null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } }; - yield return new object[] { null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; + yield return 
new object[] { engine, null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } }; - // Alternating construct - yield return new object[] { null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } }; - yield return new object[] { null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } }; - yield return new object[] { null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } }; - yield return new object[] { null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } }; - yield return new object[] { null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } }; - yield return new object[] { null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } }; - yield return new object[] { null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } }; - yield return new object[] { null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } }; - yield return new object[] { null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } }; - yield return new object[] { null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } }; + // 
Alternating construct + yield return new object[] { engine, null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } }; + yield return new object[] { engine, null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } }; + yield return new object[] { engine, null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } }; + yield return new object[] { engine, null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } }; + yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } }; + yield return new object[] { engine, null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } }; + yield return new object[] { engine, null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } }; + yield return new object[] { engine, null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } }; + yield return new object[] { engine, null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } }; - // Multiple character classes using character class subtraction - yield return new object[] { null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 
98871", RegexOptions.None, new string[] { "98871" } }; - yield return new object[] { null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } }; + // Multiple character classes using character class subtraction + yield return new object[] { engine, null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 98871", RegexOptions.None, new string[] { "98871" } }; + yield return new object[] { engine, null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } }; - // Negation with character class subtraction - yield return new object[] { null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } }; - yield return new object[] { null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } }; + // Negation with character class subtraction + yield return new object[] { engine, null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } }; + yield return new object[] { engine, null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } }; - // Misc The idea here is come up with real world examples of char class subtraction. Things that - // would be difficult to define without it - yield return new object[] { null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } }; + // Misc The idea here is come up with real world examples of char class subtraction. 
Things that + // would be difficult to define without it + yield return new object[] { engine, null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } }; - yield return new object[] { null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } }; - yield return new object[] { null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } }; + yield return new object[] { engine, null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } }; + yield return new object[] { engine, null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } }; - yield return new object[] { null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } }; + yield return new object[] { engine, null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } }; - // The following is an overly complex way of matching an ip address using char class subtraction - yield return new object[] { null, @"^ + // The following is an overly complex way of matching an ip address using char class subtraction + yield return new object[] { engine, null, @"^ (?^ ( ( @@ -157,370 +159,370 @@ public static IEnumerable Groups_Basic_TestData() )$" , "255", RegexOptions.IgnorePatternWhitespace, new string[] { "255", "255", "2", "5", "5", "", "255", "2", "5" } }; - // Character Class Substraction - yield return new object[] { null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", RegexOptions.None, new string[] { "aaa---ddd" } }; - yield return new object[] { null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } }; - yield return new object[] { null, @"[\[\]a-f-[[]]+", 
"gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } }; - yield return new object[] { null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } }; - - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; - - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } }; - - yield return new object[] { null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } }; - - yield return new object[] { null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } }; - - // Not Character class substraction - yield return new object[] { null, @"[\0- [bc]+", "!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } }; - yield return new object[] { null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } }; - yield return new object[] { null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } }; - yield return new object[] { null, "[-e-g]+", 
"ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } }; - yield return new object[] { null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } }; - yield return new object[] { null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } }; - yield return new object[] { null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } }; - - // Make sure we correctly handle \- - yield return new object[] { null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } }; - yield return new object[] { null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } }; - yield return new object[] { null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } }; - yield return new object[] { null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } }; - - // Everything works correctly with option RegexOptions.IgnorePatternWhitespace - yield return new object[] { null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } }; - yield return new object[] { null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } }; - - // Unicode Char Classes - yield return new object[] { null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - 
yield return new object[] { null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } }; - yield return new object[] { null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } }; - yield return new object[] { null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - - // Character ranges IgnoreCase - yield return new object[] { null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } }; - yield return new object[] { null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } }; - yield return new object[] { null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } }; - yield return new object[] { null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" } }; - yield return new object[] { null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } }; - yield return new object[] { null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } }; - yield return new object[] { null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } }; - yield return new object[] { null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", 
RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } }; - yield return new object[] { null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } }; - yield return new object[] { null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } }; - yield return new object[] { null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } }; - - // Escape Chars - yield return new object[] { null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } }; - yield return new object[] { null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } }; - yield return new object[] { null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } }; - - // Miscellaneous { witout matching } - yield return new object[] { null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } }; - yield return new object[] { null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } }; - yield return new object[] { null, @"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } }; - - // Miscellaneous inline options - yield return new object[] { null, @"(?n:(?cat)(\s+)(?dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } }; - yield return new object[] { null, @"(?n:(cat)(?\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"(?x: + // Character Class Substraction + yield return new object[] { engine, null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", 
RegexOptions.None, new string[] { "aaa---ddd" } }; + yield return new object[] { engine, null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } }; + yield return new object[] { engine, null, @"[\[\]a-f-[[]]+", "gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } }; + yield return new object[] { engine, null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } }; + + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; + + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } }; + + yield return new object[] { engine, null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } }; + + yield return new object[] { engine, null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } }; + + // Not Character class substraction + yield return new object[] { engine, null, @"[\0- [bc]+", 
"!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } }; + yield return new object[] { engine, null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } }; + yield return new object[] { engine, null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } }; + yield return new object[] { engine, null, "[-e-g]+", "ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } }; + yield return new object[] { engine, null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } }; + yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } }; + yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } }; + + // Make sure we correctly handle \- + yield return new object[] { engine, null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } }; + yield return new object[] { engine, null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } }; + yield return new object[] { engine, null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } }; + yield return new object[] { engine, null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } }; + + // Everything works correctly with option RegexOptions.IgnorePatternWhitespace + yield 
return new object[] { engine, null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } }; + yield return new object[] { engine, null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } }; + + // Unicode Char Classes + yield return new object[] { engine, null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } }; + yield return new object[] { engine, null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } }; + yield return new object[] { engine, null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + + // Character ranges IgnoreCase + yield return new object[] { engine, null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } }; + yield return new object[] { engine, null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } }; + yield return new object[] { engine, null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } }; + yield return new object[] { engine, null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" 
} }; + yield return new object[] { engine, null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } }; + yield return new object[] { engine, null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } }; + yield return new object[] { engine, null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } }; + yield return new object[] { engine, null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } }; + yield return new object[] { engine, null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } }; + yield return new object[] { engine, null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } }; + yield return new object[] { engine, null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } }; + + // Escape Chars + yield return new object[] { engine, null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } }; + yield return new object[] { engine, null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } }; + yield return new object[] { engine, null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } }; + + // Miscellaneous { witout matching } + yield return new object[] { engine, null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } }; + yield return new object[] { engine, null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } }; + yield return new object[] { engine, null, 
@"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } }; + + // Miscellaneous inline options + yield return new object[] { engine, null, @"(?n:(?cat)(\s+)(?dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } }; + yield return new object[] { engine, null, @"(?n:(cat)(?\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"(?x: (?cat) # Cat statement (\s+) # Whitespace chars (?dog # Dog statement ))", "cat dog", RegexOptions.None, new string[] { "cat dog", " ", "cat", "dog" } }; - yield return new object[] { null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p inside character range - yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } }; - yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - - // \x, \u, 
\a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range - yield return new object[] { null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; - yield return new object[] { null, @"(cat)([\u0041]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; - yield return new object[] { null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } }; - yield return new object[] { null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } }; - yield return new object[] { null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } }; - yield return new object[] { null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } }; - yield return new object[] { null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } }; - yield return new object[] { null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option - yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } }; - yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 
3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option - yield return new object[] { null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } }; - yield return new object[] { null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } }; - yield return new object[] { null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } }; - - // Use < in a group - yield return new object[] { null, @"cat(?dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"(?cat)\s*(?dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat 
dog", "dog" } }; - yield return new object[] { null, @"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; - yield return new object[] { null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; - yield return new object[] { null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } }; - yield return new object[] { null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } }; - yield return new object[] { null, @"(?<1>cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - - // Quantifiers - yield return new object[] { null, @"(?cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } }; - yield return new object[] { null, @"(?cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } }; - yield return new object[] { null, @"(?cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } }; - yield return new object[] { null, @"(?cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } }; - yield return new object[] { null, @"(?cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } }; - yield return new object[] { null, @"(?cat){1,END", "STARTcat{1,END", RegexOptions.None, new string[] { 
"cat{1,END", "cat" } }; - yield return new object[] { null, @"(?cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } }; - yield return new object[] { null, @"(?cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } }; - - // Use IgnorePatternWhitespace - yield return new object[] { null, @"(cat) #cat + yield return new object[] { engine, null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p inside character range + yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } }; + yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + + // \x, \u, \a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range + yield return new object[] { engine, null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\u0041]*)(dog)", 
"catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option + yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } }; + yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new 
string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option + yield return new object[] { engine, null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } }; + yield return new object[] { engine, null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } }; + yield return new object[] { engine, null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } }; + + // Use < in a group + yield return new object[] { engine, null, @"cat(?dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s*(?dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, 
@"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?<1>cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + + // Quantifiers + yield return new object[] { engine, null, @"(?cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } }; + yield return new object[] { engine, null, @"(?cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,END", 
"STARTcat{1,END", RegexOptions.None, new string[] { "cat{1,END", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } }; + + // Use IgnorePatternWhitespace + yield return new object[] { engine, null, @"(cat) #cat \s+ #followed by 1 or more whitespace (dog) #followed by dog ", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat) #cat + yield return new object[] { engine, null, @"(cat) #cat \s+ #followed by 1 or more whitespace (dog) #followed by dog", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - - // Back Reference - yield return new object[] { null, @"(?cat)(?dog)\k", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\k", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - - yield return new object[] { null, @"(?cat)\s+(?dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new 
object[] { null, @"(?cat)\s+(?dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } }; - - yield return new object[] { null, @"(?cat)\s+(?dog)\k", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } }; - - // Octal - yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } }; - yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" 
} }; - yield return new object[] { null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } }; - yield return new object[] { null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } }; - yield return new object[] { null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } }; - yield return new object[] { null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } }; - yield return new object[] { null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } }; - yield return new object[] { null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } }; - - yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } }; - yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" 
} }; - yield return new object[] { null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } }; - yield return new object[] { null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; - yield return new object[] { null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; - yield return new object[] { null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } }; - yield return new object[] { null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } }; - yield return new object[] { null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } }; - - // Decimal - yield return new object[] { null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - - // Hex - yield return new object[] { null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; - yield return new object[] { null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; - yield return new object[] { null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; - yield return new object[] { null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; - yield return new object[] { null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; - yield return new object[] { null, 
@"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; - - yield return new object[] { null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; - yield return new object[] { null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; - yield return new object[] { null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; - yield return new object[] { null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; - yield return new object[] { null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; - yield return new object[] { null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; - - // ScanControl - yield return new object[] { null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } }; - yield return new object[] { null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; - yield return new object[] { null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; - - yield return new object[] { null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; - yield return new object[] { null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; - - yield return new object[] { null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", 
"dog" } }; - yield return new object[] { null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; - - yield return new object[] { null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; - yield return new object[] { null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; - - yield return new object[] { null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; - yield return new object[] { null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; - - if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759 - { - yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } }; - } + yield return new object[] { engine, null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; + + // Back Reference + yield return new object[] { engine, null, @"(?cat)(?dog)\k", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, 
@"(?cat)\s+(?dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } }; + + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } }; + + // Octal + yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } }; + yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" 
} }; + yield return new object[] { engine, null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } }; + yield return new object[] { engine, null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } }; + yield return new object[] { engine, null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } }; + yield return new object[] { engine, null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } }; + yield return new object[] { engine, null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } }; + yield return new object[] { engine, null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } }; + + yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } }; + yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" 
} }; + yield return new object[] { engine, null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } }; + yield return new object[] { engine, null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; + yield return new object[] { engine, null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; + yield return new object[] { engine, null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } }; + yield return new object[] { engine, null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } }; + yield return new object[] { engine, null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } }; + + // Decimal + yield return new object[] { engine, null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + + // Hex + yield return new object[] { engine, null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new 
string[] { "cat...dog", "cat", "...", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; + + // ScanControl + yield return new object[] { engine, null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", 
"cat", "\u0003", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; + + if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759 + { + yield return new object[] { engine, null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } }; + } - // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B - //\A - yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\G - yield return new object[] { null, @"\Gcat\s+dog", 
"cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //^ - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } }; - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\Z - yield return new object[] { 
null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\z - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", 
RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\b - yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } }; - yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } }; - yield return new object[] { null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } }; - yield return new object[] { null, @"\b\cat)\s+(?dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } }; - - // Balanced Matching - yield return new object[] { null, @"
+ // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B + //\A + yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\G + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //^ + yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } }; + yield return new 
object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\Z + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat 
\n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\z + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\b + yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "cat", 
RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } }; + yield return new object[] { engine, null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } }; + yield return new object[] { engine, null, @"\b\cat)\s+(?dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } }; + + // Balanced Matching + yield return new object[] { engine, null, @"
(?>
(?) |
(?<-DEPTH>) | @@ -529,374 +531,390 @@ public static IEnumerable Groups_Basic_TestData() (?(DEPTH)(?!))
", "
this is some
red
text
", RegexOptions.IgnorePatternWhitespace, new string[] { "
this is some
red
text
", "" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( ((?'open'<+)[^<>]*)+ ((?'close-open'>+)[^<>]*)+ )+", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "<02deep_03<03deep_03>>>", "<03deep_03", ">>>", "<", "03deep_03" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<)? [^<>]? (?>)? )*", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "", "", "01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<[^/<>]*>)? [^<>]? (?]*>)? )*", "Cat", RegexOptions.IgnorePatternWhitespace, new string[] { "Cat", "", "", "Cat" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<(?[^/<>]*)>)? [^<>]? (?>)? )*", "catdog", RegexOptions.IgnorePatternWhitespace, new string[] { "catdog", "", "", "a", "dog" } }; - // Balanced Matching With Backtracking - yield return new object[] { null, @"( + // Balanced Matching With Backtracking + yield return new object[] { engine, null, @"( (?<[^/<>]*>)? .? (?]*>)? 
)* (?(start)(?!)) ", "Cat<<<>>><><<<>>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "Cat<<<>>><><<<>>>>", "", "", "Cat" } }; - // Character Classes and Lazy quantifier - yield return new object[] { null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } }; - yield return new object[] { null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } }; + // Character Classes and Lazy quantifier + yield return new object[] { engine, null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } }; + yield return new object[] { engine, null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } }; - // Miscellaneous/Regression scenarios - yield return new object[] { null, @"(?1)(?.*?)(?=2)", "1" + Environment.NewLine + "" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture, + // Miscellaneous/Regression scenarios + yield return new object[] { engine, null, @"(?1)(?.*?)(?=2)", "1" + Environment.NewLine + "" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture, new string[] { "1" + Environment.NewLine + "" + Environment.NewLine, "1", Environment.NewLine + ""+ Environment.NewLine } }; - yield return new object[] { null, @"\G<%#(?.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } }; - - // Nested Quantifiers - yield return new object[] { null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } }; - - // Lazy operator Backtracking - yield return new object[] { null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } }; - yield return new 
object[] { null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } }; - - yield return new object[] { null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } }; - yield return new object[] { null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - - // Backtracking - yield return new object[] { null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - yield return new object[] { null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - - // Backtracking with multiple (.*) groups -- important ASP.NET scenario - yield return new object[] { null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", 
RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with multiple (.+) groups - yield return new object[] { null, @"(.+)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with (.+) group followed by (.*) - yield return new object[] { null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, 
@"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with (.*) group followed by (.+) - yield return new object[] { null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Quantifiers - yield return new object[] { null, @"a*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*", "a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*?", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*?", "a", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*?", "aa", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a+?", "aa", 
RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } }; - yield return new object[] { null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } }; - yield return new object[] { null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } }; - yield return new object[] { null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } }; - yield return new object[] { null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } }; - yield return new object[] { null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] 
{ "bdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; - foreach (string prefix in new[] { "", "xyz" }) - { - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; - } - yield return new object[] { null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } }; - yield return new object[] { null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } }; - yield return new object[] { null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } }; - yield return new object[] { null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } }; - yield return new object[] { null, @"(cat){cat}", "cat{cat}", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; - yield return new object[] { null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; - yield return new object[] { null, 
@"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; - yield return new object[] { null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; - yield return new object[] { null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } }; - yield return new object[] { null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; - yield return new object[] { null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; - yield return new object[] { null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; - yield return new object[] { null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; - - // Atomic subexpressions - // Implicitly upgrading (or not) oneloop to be atomic - yield return new object[] { null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; - yield return new object[] { null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, 
@"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } }; - yield return new object[] { null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade - yield return new object[] { null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; - yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; - yield return new object[] { null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; - yield return new object[] { null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; - yield return new object[] { null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; - yield return new object[] { null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; - yield return new object[] { null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } }; - yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } }; - yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } }; - yield return new object[] { null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { 
"abbabab" } }; - yield return new object[] { null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } }; - // Implicitly upgrading (or not) notoneloop to be atomic - yield return new object[] { null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } }; - yield return new object[] { null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } }; - yield return new object[] { null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade - yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } }; - yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", RegexOptions.IgnoreCase, new string[] { "BAababa" } }; - // Implicitly upgrading (or not) setloop to be atomic - yield return new object[] { null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { 
"aaab" } }; - yield return new object[] { null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; - yield return new object[] { null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; - yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; - yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; - yield return new object[] { null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } }; - yield return new object[] { null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[hij]*)ih", 
"efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } }; - yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } }; - yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } }; - yield return new object[] { null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; - yield return new object[] { null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; - yield return new object[] { null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } }; - // Implicitly upgrading (or not) concat loops to be atomic - yield return new object[] { null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } }; - // Implicitly upgrading (or not) nested loops to be atomic - yield return new object[] { null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; - yield return new object[] { null, @"(?:a{2}?){3}?", "aaaaaaaaa", 
RegexOptions.None, new string[] { "aaaaaa" } }; - yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } }; - yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } }; - yield return new object[] { null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } }; - // Nested atomic - yield return new object[] { null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } }; - yield return new object[] { null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } }; - - // Anchoring loops beginning with .* / .+ - yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } }; - yield return new object[] { null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } }; - yield return new object[] { null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } }; - yield return new object[] { null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } }; - yield return new object[] { null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123" - yield return new object[] { null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } }; - yield 
return new object[] { null, @".*", "\n", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } }; - yield return new object[] { null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".*", "abc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } }; - yield return new object[] { null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc" - yield return new object[] { null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } }; - yield return new object[] { null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } }; - yield return new object[] { null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } }; - yield return new object[] { null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; - yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; - yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; - yield return new object[] { null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; - yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } }; - yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } }; - yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } }; 
- yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } }; - yield return new object[] { null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } }; - yield return new object[] { null, @".+", "a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } }; - yield return new object[] { null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } }; - yield return new object[] { null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; - yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; - yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; - yield return new object[] { null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; - - // Unanchored .* - yield return new object[] { null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } }; - yield return new object[] { null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Na\nme)", 
RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } }; - foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) - { - yield return new object[] { null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } }; - yield return new object[] { null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } }; - yield return new object[] { null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } }; - yield return new object[] { null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } }; - yield return new object[] { null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } }; - yield return new object[] { null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } }; - yield return new object[] { null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } }; - yield return new object[] { null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } }; - } + yield return new object[] { engine, null, @"\G<%#(?.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } }; + + // Nested Quantifiers + yield return new object[] { engine, null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } }; + + // Lazy operator Backtracking + yield return new object[] { engine, null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } }; + yield return new object[] { engine, null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } }; + + yield return new object[] { engine, null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } }; + yield return new object[] { 
engine, null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + + // Backtracking + yield return new object[] { engine, null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + yield return new object[] { engine, null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + + // Backtracking with multiple (.*) groups -- important ASP.NET scenario + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Backtracking with multiple (.+) groups + yield return new object[] { engine, null, @"(.+)/(.+).aspx", 
"pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Backtracking with (.+) group followed by (.*) + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", 
"/pages", "homepage.aspx", "index" } }; + + // Backtracking with (.*) group followed by (.+) + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Quantifiers + yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*?", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*?", "a", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*?", "aa", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a+?", "aa", RegexOptions.None, new 
string[] { "a" } }; + yield return new object[] { engine, null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } }; + yield return new object[] { engine, null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } }; + yield return new object[] { engine, null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } }; + yield return new object[] { engine, null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } }; + yield return new object[] { engine, null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } }; + yield return new object[] { engine, null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; 
+ yield return new object[] { engine, null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; + foreach (string prefix in new[] { "", "xyz" }) + { + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; + } + yield return new object[] { engine, null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } }; + yield return new object[] { engine, null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } }; + yield return new object[] { engine, null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } }; + yield return new object[] { engine, null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat}", "cat{cat}", 
RegexOptions.None, new string[] { "cat{cat}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; + yield return new object[] { engine, null, @"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; + yield return new object[] { engine, null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; + + // Atomic subexpressions + // Implicitly upgrading (or not) oneloop to be atomic + yield return new object[] { engine, null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new 
object[] { engine, null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; + yield return new object[] { engine, null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } }; + yield return new object[] { engine, null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade + yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; + yield return new object[] { engine, null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; + yield return new object[] { engine, 
null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } }; + yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } }; + yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } }; + yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { "abbabab" } }; + yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } }; + // Implicitly upgrading (or not) notoneloop to be atomic + yield return new object[] { engine, null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } }; + yield return new object[] { engine, null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } }; + yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade + yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } }; + yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", 
RegexOptions.IgnoreCase, new string[] { "BAababa" } }; + // Implicitly upgrading (or not) setloop to be atomic + yield return new object[] { engine, null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; + yield return new object[] { engine, null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.Multiline, 
new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; + yield return new object[] { engine, null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)ih", "efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } }; + yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } }; + yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } }; + yield return new object[] { engine, null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; + yield return new object[] { engine, null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; + yield return new object[] { engine, null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } }; + // Implicitly upgrading (or not) concat loops to be atomic + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } }; + yield return new object[] { engine, null, 
@"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } }; + // Implicitly upgrading (or not) nested loops to be atomic + yield return new object[] { engine, null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; + yield return new object[] { engine, null, @"(?:a{2}?){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; + yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } }; + yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } }; + yield return new object[] { engine, null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } }; + // Nested atomic + yield return new object[] { engine, null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } }; + yield return new object[] { engine, null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } }; + + // Anchoring loops beginning with .* / .+ + yield return new object[] { engine, null, @".*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*", 
"\n\n\n\n", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } }; + yield return new object[] { engine, null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } }; + yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } }; + yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } }; + yield return new object[] { engine, null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123" + yield return new object[] { engine, null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*", "\n", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*", "abc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } }; + yield return new object[] { engine, null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc" + yield return new 
object[] { engine, null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } }; + yield return new object[] { engine, null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } }; + yield return new object[] { engine, null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } }; + yield return new object[] { engine, null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; + yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; + yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; + yield return new object[] { engine, null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; + yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } }; + yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } }; + yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } }; + yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } }; + yield return new object[] { engine, null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { 
"456abc" } }; + yield return new object[] { engine, null, @".+", "a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } }; + yield return new object[] { engine, null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } }; + yield return new object[] { engine, null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; + yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; + yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; + yield return new object[] { engine, null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; + + // Unanchored .* + yield return new object[] { engine, null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } }; + yield return new object[] { engine, null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Na\nme)", RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } }; + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) + { + yield return new object[] { engine, null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } }; + yield return new object[] { engine, null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } }; + yield return new object[] { engine, null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } }; + yield return new object[] { engine, null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } }; + yield return new object[] { engine, 
null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } }; + yield return new object[] { engine, null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } }; + yield return new object[] { engine, null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } }; + yield return new object[] { engine, null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } }; + } - // Grouping Constructs Invalid Regular Expressions - yield return new object[] { null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } }; - - // Alternation construct Invalid Regular Expressions - yield return new object[] { null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } }; - yield return new object[] { null, @"(?(cat)catdog|)", "dog", 
RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } }; - - // Invalid unicode - yield return new object[] { null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; - - yield return new object[] { null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } }; - yield return new object[] { null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } }; - - // Empty Match - yield return new object[] { null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } }; - yield return new object[] { null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } }; + // Grouping Constructs Invalid Regular Expressions + yield return new object[] { engine, null, @"()", "cat", 
RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } }; + + // Alternation construct Invalid Regular Expressions + yield return new object[] { engine, null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } }; + yield return new object[] { engine, null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, 
@"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } }; + + // Invalid unicode + yield return new object[] { engine, null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; + + yield return new object[] { engine, null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } }; + yield return new object[] { engine, null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } }; + + // Empty Match + yield return new object[] { engine, null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } }; + yield return new object[] { engine, null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_enUS() { - yield return new object[] { "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "en-US", "aA", "Aa", 
RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "en-US", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Czech() { - yield return new object[] { "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Danish() { - yield return new object[] { "da-DK", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "da-DK", "AA", "Aa", 
RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Turkish() { - yield return new object[] { "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_AzeriLatin() { - if (PlatformDetection.IsNotBrowser) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + if (PlatformDetection.IsNotBrowser) + { + yield return new object[] { engine, "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } } @@ -909,7 +927,7 @@ public static IEnumerable Groups_CustomCulture_TestData_AzeriLatin() [MemberData(nameof(Groups_CustomCulture_TestData_AzeriLatin))] [ActiveIssue("https://github.com/dotnet/runtime/issues/56407", TestPlatforms.Android)] [ActiveIssue("https://github.com/dotnet/runtime/issues/36900", TestPlatforms.iOS | TestPlatforms.tvOS | TestPlatforms.MacCatalyst)] - public async Task Groups(string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch 
= null) + public async Task Groups(RegexEngine engine, string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch = null) { if (cultureName is null) { @@ -917,59 +935,50 @@ public async Task Groups(string cultureName, string pattern, string input, Regex cultureName = culture.Equals(CultureInfo.InvariantCulture) ? "en-US" : culture.Name; } - using (new ThreadCultureChange(cultureName)) - { - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - { - // Alternative altMatch when order of alternations matters in backtracking but order does not matter in NonBacktracking mode - // Also in NonBacktracking there is only a single top-level match, which is expectedGroups[0] when altMatch is null - string[] expected = engine == RegexEngine.NonBacktracking ? - new string[] { altMatch ?? expectedGroups[0] } : - expectedGroups; + // Alternative altMatch when order of alternations matters in backtracking but order does not matter in NonBacktracking mode + // Also in NonBacktracking there is only a single top-level match, which is expectedGroups[0] when altMatch is null + expectedGroups = engine == RegexEngine.NonBacktracking ? + new string[] { altMatch ?? 
expectedGroups[0] } : + expectedGroups; - await GroupsAsync(engine, pattern, input, options, expected); - } + if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)")) + { + // General if-then-else construct is not supported and uses the ?(cat) condition in the tests + // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete + return; } - static async Task GroupsAsync(RegexEngine engine, string pattern, string input, RegexOptions options, string[] expectedGroups) + using var _ = new ThreadCultureChange(cultureName); + + Regex regex; + try { - if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)")) - { - // General if-then-else construct is not supported and uses the ?(cat) condition in the tests - // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete - return; - } + regex = await RegexHelpers.GetRegexAsync(engine, pattern, options); + } + catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine)) + { + // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences + return; + } - Regex regex; - try - { - regex = await RegexHelpers.GetRegexAsync(engine, pattern, options); - } - catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine)) - { - // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences - return; - } + Match match = regex.Match(input); - Match match = regex.Match(input); + Assert.True(match.Success); + Assert.Equal(expectedGroups[0], match.Value); - Assert.True(match.Success); - Assert.Equal(expectedGroups[0], match.Value); + if (!RegexHelpers.IsNonBacktracking(engine)) + { + Assert.Equal(expectedGroups.Length, match.Groups.Count); - if (!RegexHelpers.IsNonBacktracking(engine)) + int[] groupNumbers = regex.GetGroupNumbers(); + string[] groupNames = regex.GetGroupNames(); + 
for (int i = 0; i < expectedGroups.Length; i++) { - Assert.Equal(expectedGroups.Length, match.Groups.Count); - - int[] groupNumbers = regex.GetGroupNumbers(); - string[] groupNames = regex.GetGroupNames(); - for (int i = 0; i < expectedGroups.Length; i++) - { - Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value); - Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]); - - Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i])); - Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i])); - } + Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value); + Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]); + + Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i])); + Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i])); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index e7f0b4d298553..5da3f4fcaa331 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -239,6 +239,8 @@ public static IEnumerable Match_MemberData() { yield return ("aaa(?i:match this)bbb", "aaaMaTcH ThIsbbb", RegexOptions.None, 0, 16, true, "aaaMaTcH ThIsbbb"); } + yield return ("(?i:a)b(?i:c)d", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "AbCd"); + yield return ("(?i:[\u0000-\u1000])[Bb]", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "Ab"); // Turning off case insensitive option in mid-pattern : Actual - "aaa(?-i:match this)bbb", "i" yield return ("aAa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb"); @@ -274,6 +276,8 @@ public static IEnumerable Match_MemberData() yield return (@"\p{Ll}", "1bc", RegexOptions.IgnoreCase, 0, 3, true, "b"); yield return (@"\p{Lt}", "1bc", 
RegexOptions.IgnoreCase, 0, 3, true, "b"); yield return (@"\p{Lo}", "1bc", RegexOptions.IgnoreCase, 0, 3, false, string.Empty); + yield return (".[abc]", "xYZAbC", RegexOptions.IgnoreCase, 0, 6, true, "ZA"); + yield return (".[abc]", "xYzXyZx", RegexOptions.IgnoreCase, 0, 6, false, ""); // "\D+" yield return (@"\D+", "12321", RegexOptions.None, 0, 5, false, string.Empty); @@ -360,7 +364,6 @@ public static IEnumerable Match_MemberData() yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); } yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); - yield return (@"a\w*a|def", "aaaaa", RegexOptions.None, 0, 5, true, "aaaaa"); // No Negation yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty); @@ -516,6 +519,8 @@ public static IEnumerable Match_MemberData() yield return (@".*\dFoo", "This1foo should 2FoO match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, "This1foo should 2FoO"); yield return (@".*\dFoo", "This1Foo should 2fOo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, "This1Foo should 2fOo"); yield return (@".*\dfoo", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 8, 4, true, "2FOO"); + yield return (@"[\w\s].*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "1fooThis2FOO should 1foo match"); + yield return (@"i.*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "is2FOO should 1foo match"); } // [ActiveIssue("https://github.com/dotnet/runtime/issues/36149")] @@ -537,6 +542,29 @@ public static IEnumerable Match_MemberData() // yield return (@"^(?i:[\u24B6-\u24D0])$", ((char)('\u24CF' + 26)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u24CF' + 
26)).ToString()); //} + // Long inputs + string longCharacterRange = string.Concat(Enumerable.Range(1, 0x2000).Select(c => (char)c)); + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.IgnoreCase }) + { + yield return ("\u1000", longCharacterRange, options, 0, 0x2000, true, "\u1000"); + yield return ("[\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u1000"); + yield return ("[\u0FF0-\u0FFF][\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u0FFF\u1000"); + + yield return ("\uA640", longCharacterRange, options, 0, 0x2000, false, ""); + yield return ("[\u3000-\u3001]", longCharacterRange, options, 0, 0x2000, false, ""); + yield return ("[\uA640-\uA641][\u3000-\u3010]", longCharacterRange, options, 0, 0x2000, false, ""); + + if (!RegexHelpers.IsNonBacktracking(engine)) + { + yield return ("\u1000", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1000"); + yield return ("[\u1000-\u1001]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1001"); + yield return ("[\u1000][\u1001-\u1010]", longCharacterRange, options, 0, 0x2000, true, "\u1000\u1001"); + + yield return ("\uA640", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, ""); + yield return ("[\u3000-\u3001][\uA640-\uA641]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, ""); + } + } + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) { yield return (@"\W.*?\D", "seq 012 of 3 digits", options, 0, 19, true, " 012 "); @@ -1283,13 +1311,11 @@ public void Match_ExcessPrefix(RegexEngine engine) // Repeaters Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in debug - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{51,}")).IsMatch("a")); + Assert.False((await 
RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in release - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_001,}")).IsMatch("a")); // Multis - foreach (int length in new[] { 50, 51, 50_000, 50_001, char.MaxValue + 1 }) // based on knowledge of cut-offs used in Boyer-Moore + foreach (int length in new[] { 50, 50_000, char.MaxValue + 1 }) { // The large counters are too slow for counting a's in NonBacktracking engine // They will incur a constant of size length because in .*a{k} after reading n a's the diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs index b325f1c0bf2c7..e1792c623be21 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs @@ -48,11 +48,11 @@ public static bool IsDefaultStart(string input, RegexOptions options, int start) return start == 0; } - public static Regex CreateRegexInCulture(string pattern, RegexOptions options, Globalization.CultureInfo culture) + public static async Task GetRegexAsync(RegexEngine engine, string pattern, RegexOptions options, Globalization.CultureInfo culture) { using (new System.Tests.ThreadCultureChange(culture)) { - return new Regex(pattern, options); + return await GetRegexAsync(engine, pattern, options); } } @@ -116,7 +116,7 @@ public static async Task GetRegexAsync(RegexEngine engine, string pattern // - Handle NonBacktrackingSourceGenerated return - options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) : + options is null ? new Regex(pattern, OptionsFromEngine(engine)) : matchTimeout is null ? 
new Regex(pattern, options.Value | OptionsFromEngine(engine)) : new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value); } @@ -136,7 +136,7 @@ public static async Task GetRegexesAsync(RegexEngine engine, params (st { (string pattern, RegexOptions? options, TimeSpan? matchTimeout) = regexes[i]; results[i] = - options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) : + options is null ? new Regex(pattern, OptionsFromEngine(engine)) : matchTimeout is null ? new Regex(pattern, options.Value | OptionsFromEngine(engine)) : new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs index adcde90c42b97..028afabe61d9e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -12,62 +12,43 @@ namespace System.Text.RegularExpressions.Tests { public class RegexCultureTests { - // TODO: Validate source generator after figuring out what to do with culture - - public static IEnumerable RegexOptionsExtended() - { - yield return RegexOptions.None; - yield return RegexOptions.Compiled; - if (PlatformDetection.IsNetCore) - { - yield return RegexHelpers.RegexOptionNonBacktracking; - } - } - - public static IEnumerable RegexOptionsExtended_MemberData() => - from options in RegexOptionsExtended() - select new object[] { options }; - public static IEnumerable CharactersComparedOneByOne_AnchoredPattern_TestData() { - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { "^aa$", "aA", "da-DK", options, false }; - yield return new object[] { "^aA$", "aA", "da-DK", options, true }; - yield return new object[] { "^aa$", "aA", "da-DK", options | 
RegexOptions.IgnoreCase, true }; - yield return new object[] { "^aA$", "aA", "da-DK", options | RegexOptions.IgnoreCase, true }; + yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.None, false }; + yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.None, true }; + yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; + yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; } } [Theory] [MemberData(nameof(CharactersComparedOneByOne_AnchoredPattern_TestData))] - public void CharactersComparedOneByOne_AnchoredPattern(string pattern, string input, string culture, RegexOptions options, bool expected) + public async Task CharactersComparedOneByOne_AnchoredPattern(RegexEngine engine, string pattern, string input, string culture, RegexOptions options, bool expected) { // Regex compares characters one by one. If that changes, it could impact the behavior of // a case like this, where these characters are not the same, but the strings compare // as equal with the invariant culture (and some other cultures as well). 
using (new ThreadCultureChange(culture)) { - foreach (RegexOptions compiled in new[] { RegexOptions.None, RegexOptions.Compiled }) - { - Assert.Equal(expected, new Regex(pattern, options | compiled).IsMatch(input)); - } + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, options); + Assert.Equal(expected, r.IsMatch(input)); } } - public static IEnumerable CharactersComparedOneByOne_Invariant_TestData() { - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { options }; - yield return new object[] { options | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }; + yield return new object[] { engine, RegexOptions.None }; + yield return new object[] { engine, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }; } } [Theory] [MemberData(nameof(CharactersComparedOneByOne_Invariant_TestData))] - public void CharactersComparedOneByOne_Invariant(RegexOptions options) + public async Task CharactersComparedOneByOne_Invariant(RegexEngine engine, RegexOptions options) { // Regex compares characters one by one. If that changes, it could impact the behavior of // a case like this, where these characters are not the same, but the strings compare @@ -88,35 +69,20 @@ public void CharactersComparedOneByOne_Invariant(RegexOptions options) string input = string.Concat(Enumerable.Repeat(S2, multiple)); Regex r; - // Validate when the string is at the beginning of the pattern, as it impacts Boyer-Moore prefix matching. - r = new Regex(pattern, options); + // Validate when the string is at the beginning of the pattern, as it impacts prefix matching. + r = await RegexHelpers.GetRegexAsync(engine, pattern, options); Assert.False(r.IsMatch(input)); Assert.True(r.IsMatch(pattern)); // Validate when it's not at the beginning of the pattern, as it impacts "multi" matching. 
- r = new Regex("[abc]" + pattern, options); + r = await RegexHelpers.GetRegexAsync(engine, "[abc]" + pattern, options); Assert.False(r.IsMatch("a" + input)); Assert.True(r.IsMatch("a" + pattern)); } } - public static IEnumerable CharactersLowercasedOneByOne_MemberData() - { - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - { - switch (engine) - { - case RegexEngine.SourceGenerated: - case RegexEngine.NonBacktrackingSourceGenerated: - continue; - } - - yield return new object[] { engine }; - } - } - [Theory] - [MemberData(nameof(CharactersLowercasedOneByOne_MemberData))] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task CharactersLowercasedOneByOne(RegexEngine engine) { using (new ThreadCultureChange("en-US")) @@ -191,15 +157,15 @@ Regex[] Create(string input, CultureInfo info, RegexOptions additional) [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")] [Fact] [ActiveIssue("https://github.com/dotnet/runtime/issues/60568", TestPlatforms.Android)] - public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking() + public async Task TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking() { var turkish = new CultureInfo("tr-TR"); string input = "I\u0131\u0130i"; // Use the input as the regex also // Ignore the Compiled option here because it is a noop in combination with NonBacktracking - Regex cultInvariantRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, CultureInfo.InvariantCulture); - Regex turkishRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase, turkish); + Regex cultInvariantRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 
CultureInfo.InvariantCulture); + Regex turkishRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase, turkish); Assert.True(cultInvariantRegex.IsMatch(input)); Assert.True(turkishRegex.IsMatch(input)); // <---------- This result differs from the result in the previous test!!! @@ -220,60 +186,70 @@ public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktr Assert.True(turkishRegex.IsMatch(input.ToUpper(turkish))); } - [ActiveIssue("Incorrect handling of IgnoreCase over intervals in Turkish Culture, https://github.com/dotnet/runtime/issues/58958")] - [Fact] - public void TurkishCulture_Handling_Of_IgnoreCase() + [ActiveIssue("https://github.com/dotnet/runtime/issues/58958")] + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task TurkishCulture_Handling_Of_IgnoreCase(RegexEngine engine) { var turkish = new CultureInfo("tr-TR"); string input = "I\u0131\u0130i"; string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]"; - Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase, turkish); + Regex regex = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.IgnoreCase, turkish); // The pattern must trivially match the input because all of the letters fall in the given intervals // Ignoring case can only add more letters here -- not REMOVE letters Assert.True(regex.IsMatch(input)); } - [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")] - [Fact] - public void TurkishCulture_Handling_Of_IgnoreCase_NonBacktracking() + public static IEnumerable TurkishCulture_MatchesWordChar_MemberData() { - var turkish = new CultureInfo("tr-TR"); - string input = "I\u0131\u0130i"; - string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]"; - - Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase | RegexHelpers.RegexOptionNonBacktracking, 
turkish); + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.None, "I\u0131\u0130i" }; + yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase, "I\u0131\u0130i" }; + if (!RegexHelpers.IsNonBacktracking(engine)) + { + yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase | RegexOptions.ECMAScript, "" }; + } + } + } - // The pattern must trivially match the input because all of the letters fall in the given intervals - // Ignoring case can only add more letters here -- not REMOVE letters - Assert.True(regex.IsMatch(input)); + [Theory] + [MemberData(nameof(TurkishCulture_MatchesWordChar_MemberData))] + public async Task TurkishCulture_MatchesWordChar(RegexEngine engine, string input, RegexOptions options, string expectedResult) + { + using (new ThreadCultureChange(new CultureInfo("tr-TR"))) + { + Regex regex = await RegexHelpers.GetRegexAsync(engine, @"\w*", options); + Assert.Equal(expectedResult, regex.Match(input).Value); + } } public static IEnumerable Match_In_Different_Cultures_TestData() { CultureInfo invariant = CultureInfo.InvariantCulture; - CultureInfo current = CultureInfo.CurrentCulture; + CultureInfo enUS = new CultureInfo("en-US"); CultureInfo turkish = new CultureInfo("tr-TR"); - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { // \u0130 (Turkish I with dot) and \u0131 (Turkish i without dot) are unrelated characters in general // Expected answers in the default en-US culture - yield return new object[] { "(?i:I)", options, current, "xy\u0131ab", "" }; - yield return new object[] { "(?i:iI+)", options, current, "abcIIIxyz", "III" }; - yield return new object[] { "(?i:iI+)", options, current, "abcIi\u0130xyz", "Ii\u0130" }; - yield return new object[] { "(?i:iI+)", options, current, "abcI\u0130ixyz", "I\u0130i" }; - yield return new object[] { 
"(?i:iI+)", options, current, "abc\u0130IIxyz", "\u0130II" }; - yield return new object[] { "(?i:iI+)", options, current, "abc\u0130\u0131Ixyz", "" }; - yield return new object[] { "(?i:iI+)", options, current, "abc\u0130Iixyz", "\u0130Ii" }; - yield return new object[] { "(?i:[^IJKLM]I)", options, current, "ii\u0130i\u0131ab", "" }; + yield return new object[] { "(?i:I)", engine, enUS, "xy\u0131ab", "" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcIIIxyz", "III" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcIi\u0130xyz", "Ii\u0130" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcI\u0130ixyz", "I\u0130i" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130IIxyz", "\u0130II" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130\u0131Ixyz", "" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130Iixyz", "\u0130Ii" }; + yield return new object[] { "(?i:[^IJKLM]I)", engine, enUS, "ii\u0130i\u0131ab", "" }; // Expected answers in the invariant culture - yield return new object[] { "(?i:I)", options, invariant, "xy\u0131ab", "" }; - yield return new object[] { "(?i:iI+)", options, invariant, "abcIIIxyz", "III" }; - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130\u0131Ixyz", "" }; + yield return new object[] { "(?i:I)", engine, invariant, "xy\u0131ab", "" }; + yield return new object[] { "(?i:iI+)", engine, invariant, "abcIIIxyz", "III" }; + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130\u0131Ixyz", "" }; // Expected answers in the Turkish culture // @@ -281,17 +257,17 @@ public static IEnumerable Match_In_Different_Cultures_TestData() // https://github.com/dotnet/runtime/issues/60568 if (!PlatformDetection.IsAndroid) { - yield return new object[] { "(?i:I)", options, turkish, "xy\u0131ab", "\u0131" }; - yield return new object[] { "(?i:iI+)", options, turkish, "abcIIIxyz", "" }; - yield return new object[] { "(?i:iI+)", options, 
turkish, "abcIi\u0130xyz", "" }; - yield return new object[] { "(?i:iI+)", options, turkish, "abcI\u0130ixyz", "" }; - yield return new object[] { "(?i:[^IJKLM]I)", options, turkish, "ii\u0130i\u0131ab", "i\u0131" }; + yield return new object[] { "(?i:I)", engine, turkish, "xy\u0131ab", "\u0131" }; + yield return new object[] { "(?i:iI+)", engine, turkish, "abcIIIxyz", "" }; + yield return new object[] { "(?i:iI+)", engine, turkish, "abcIi\u0130xyz", "" }; + yield return new object[] { "(?i:iI+)", engine, turkish, "abcI\u0130ixyz", "" }; + yield return new object[] { "(?i:[^IJKLM]I)", engine, turkish, "ii\u0130i\u0131ab", "i\u0131" }; } // None and Compiled are separated into the Match_In_Different_Cultures_CriticalCases test - if (options == RegexHelpers.RegexOptionNonBacktracking) + if (RegexHelpers.IsNonBacktracking(engine)) { - foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(options)) + foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(engine)) { yield return data; } @@ -299,39 +275,39 @@ public static IEnumerable Match_In_Different_Cultures_TestData() } } - public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions options) + public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine engine) { CultureInfo invariant = CultureInfo.InvariantCulture; CultureInfo turkish = new CultureInfo("tr-TR"); // Expected answers in the invariant culture - yield return new object[] { "(?i:iI+)", options, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abcI\u0130ixyz", "" }; // <-- failing for Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled - yield return new object[] { 
"(?i:[^IJKLM]I)", options, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abcI\u0130ixyz", "" }; // <-- failing for Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled + yield return new object[] { "(?i:[^IJKLM]I)", engine, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled // Expected answers in the Turkish culture // Android produces unexpected results for tr-TR // https://github.com/dotnet/runtime/issues/60568 if (!PlatformDetection.IsAndroid) { - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled } } public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData() => - Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.None).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.Compiled)); + 
Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Interpreter).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Compiled)); [ActiveIssue("https://github.com/dotnet/runtime/issues/60899", TestPlatforms.Browser)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)] [Theory] [MemberData(nameof(Match_In_Different_Cultures_TestData))] - [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)] - public void Match_In_Different_Cultures(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected) + public async Task Match_In_Different_Cultures(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected) { - Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture); + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture); Match match = r.Match(input); Assert.Equal(match_expected, match.Value); } @@ -339,9 +315,9 @@ public void Match_In_Different_Cultures(string pattern, RegexOptions options, Cu [ActiveIssue("Incorrect treatment of IgnoreCase in Turkish and Invariant cultures, https://github.com/dotnet/runtime/issues/58956, https://github.com/dotnet/runtime/issues/58958 ")] [Theory] [MemberData(nameof(Match_In_Different_Cultures_CriticalCases_TestData))] - public void Match_In_Different_Cultures_CriticalCases(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected) + public async Task Match_In_Different_Cultures_CriticalCases(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected) { - Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture); + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture); Match match = r.Match(input); Assert.Equal(match_expected, match.Value); } @@ -367,9 +343,8 @@ public 
void Match_InvariantCulture_None_vs_Compiled() ///
[OuterLoop("May take several seconds due to large number of cultures tested")] [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)] - [Theory] - [MemberData(nameof(RegexOptionsExtended_MemberData))] - public void TestIgnoreCaseRelation(RegexOptions options) + [Fact] + public void TestIgnoreCaseRelation() { // these 22 characters are considered case-insensitive by regex, while they are case-sensitive outside regex // but they are only case-sensitive in an asymmmetrical way: tolower(c)=c, tolower(toupper(c)) != c @@ -380,10 +355,10 @@ public void TestIgnoreCaseRelation(RegexOptions options) { char cU = char.ToUpper(c); Assert.NotEqual(c, cU); - Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), options | RegexOptions.IgnoreCase)); + Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), RegexOptions.IgnoreCase)); } - Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", options | RegexOptions.IgnoreCase)); + Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", RegexOptions.IgnoreCase)); // as baseline it is assumed the the invariant culture does not change HashSet[] inv_table = ComputeIgnoreCaseTable(CultureInfo.InvariantCulture, treatedAsCaseInsensitive); diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs index ee6339561b8bc..3f27aab3d89b8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs @@ -33,11 +33,14 @@ public class RegexExperiment /// Output directory for generated dgml files. private static string DgmlOutputDirectoryPath => Path.Combine(s_tmpWorkingDir, "dgml"); - private static string ExperimentDirectoryPath => Path.Combine(s_tmpWorkingDir, "experiments"); - - [ConditionalFact(nameof(Enabled))] + [Fact] public void RegenerateUnicodeTables() { + if (!Enabled) + { + return; + } + MethodInfo? 
genUnicode = typeof(Regex).GetMethod("GenerateUnicodeTables", BindingFlags.NonPublic | BindingFlags.Static); // GenerateUnicodeTables is not available in Release build if (genUnicode is not null) @@ -46,9 +49,6 @@ public void RegenerateUnicodeTables() } } - private static void WriteOutput(string message) => - File.AppendAllText(OutputFilePath, message); - /// Save the regex as a DFA in DGML format in the textwriter. private static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hideStateInfo = false, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, int maxLabelLength = -1, bool asNFA = false) { @@ -85,60 +85,6 @@ internal static void ViewDGML(Regex regex, int bound = -1, bool hideStateInfo = } } - /// - /// The intent is that this method is run in realease build for lightweight performance testing. - /// One can e.g. open the outputfile in emacs with AUTO-REVERT-ON in order to follow the progress in real time. - /// It will print timing info and match info for both DFA, Compiled option and None. - /// Place sample regexes in the regexesfile (one per line) and sample input in inputfile. - /// It will essentially produce a csv file with the info: - /// regexnr, matchtime_DFA, result_DFA, matchtime_Compiled, result_Compiled, matchtime_None, result_None, - /// where result_.. 
is one of - /// Yes(index,length) - /// No - /// TIMEOUT - /// ERROR - /// and in the case of TIMEOUT or ERROR time is 10000 (the timeout limit of 10sec) - /// - [ConditionalFact(nameof(Enabled))] - public void TestRunPerformance() - { - if (!Directory.Exists(ExperimentDirectoryPath)) - { - Directory.CreateDirectory(ExperimentDirectoryPath); - } - - string[] dirs = Directory.GetDirectories(ExperimentDirectoryPath); - if (dirs.Length == 0) - { - WriteOutput("\nExperiments directory is empty"); - return; - } - - DirectoryInfo experimentDI = Directory.GetParent(dirs[0]); - DirectoryInfo[] experiments = - Array.FindAll(experimentDI.GetDirectories(), - di => ((di.Attributes & FileAttributes.Hidden) != (FileAttributes.Hidden)) && - Array.Exists(di.GetFiles(), f => f.Name.Equals("regexes.txt")) && - Array.Exists(di.GetFiles(), f => f.Name.Equals("input.txt"))); - if (experiments.Length == 0) - { - WriteOutput("\nExperiments directory has no indiviual experiment subdirectories containing files 'regexes.txt' and 'input.txt'."); - return; - } - - for (int i = 0; i < experiments.Length; i++) - { - string input = File.ReadAllText(Path.Combine(experiments[i].FullName, "input.txt")); - string[] rawRegexes = File.ReadAllLines(Path.Combine(experiments[i].FullName, "regexes.txt")); - - WriteOutput($"\n---------- {experiments[i].Name} ----------"); - for (int r = 0; r < rawRegexes.Length; r++) - { - TestRunRegex((r + 1).ToString(), rawRegexes[r], input); - } - } - } - private static long MeasureMatchTime(Regex re, string input, out Match match) { try @@ -178,9 +124,14 @@ private static string And(params string[] regexes) /// private static string Not(string regex) => $"(?({regex})[0-[0]]|.*)"; - [ConditionalFact(nameof(Enabled))] + [Fact] public void ViewSampleRegexInDGML() { + if (!Enabled) + { + return; + } + try { //string rawregex = @"\bis\w*\b"; @@ -233,45 +184,6 @@ static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hid } } - private void 
TestRunRegex(string name, string rawregex, string input, bool viewDGML = false, bool dotStar = false) - { - var reNone = new Regex(rawregex, RegexOptions.None, new TimeSpan(0, 0, 10)); - var reCompiled = new Regex(rawregex, RegexOptions.Compiled, new TimeSpan(0, 0, 10)); - var reNonBacktracking = new Regex(rawregex, RegexOptions.NonBacktracking); - - if (viewDGML) - ViewDGML(reNonBacktracking, addDotStar: dotStar); - WriteOutput($"\n{name}"); - - // First call in each case is a warmup - - // None - MeasureMatchTime(reNone, input, out _); - long tN = MeasureMatchTime(reNone, input, out Match mN); - WriteMatchOutput(tN, mN); - - // Compiled - MeasureMatchTime(reCompiled, input, out _); - long tC = MeasureMatchTime(reCompiled, input, out Match mC); - WriteMatchOutput(tC, mC); - - // Non-Backtracking - MeasureMatchTime(reNonBacktracking, input, out _); - long tD = MeasureMatchTime(reNonBacktracking, input, out Match mD); - WriteMatchOutput(tD, mD); - - void WriteMatchOutput(long t, Match m) - { - WriteOutput(t switch - { - -1 => ",10000,TIMEOUT", - -2 => ",10000,ERROR", - _ when m.Success => $",{t},Yes({m.Index}:{m.Length})", - _ => $",{t},No" - }); - } - } - #region Tests involving Intersection and Complement // Currently only run in DEBUG mode in the NonBacktracking engine [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index ad5ca8d0754d9..a5dd31a5252fe 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -363,6 +363,7 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")] [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")] 
[InlineData("bonjour|hej|ciao|shalom|zdravo|pozdrav|hallo|hola|hello|hey|witam|tere|bonjou|salam|helo|sawubona", "(?>bonjou(?>r|)|h(?>e(?>j|(?>l(?>lo|o)|y))|allo|ola)|ciao|s(?>halom|a(?>lam|wubona))|zdravo|pozdrav|witam|tere)")] + [InlineData("\\w\\d123|\\w\\dabc", "\\w\\d(?:123|abc)")] // Auto-atomicity [InlineData("a*b", "(?>a*)b")] [InlineData("a*b+", "(?>a*)b+")] @@ -384,6 +385,16 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?:w*)+\\.", "(?>w*)+\\.")] [InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")] [InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")] + // IgnoreCase set creation + [InlineData("(?i)abcd", "[Aa][Bb][Cc][Dd]")] + [InlineData("(?i)abcd|efgh", "[Aa][Bb][Cc][Dd]|[Ee][Ff][Gg][Hh]")] + [InlineData("(?i)a|b", "[AaBb]")] + [InlineData("(?i)[abcd]", "[AaBbCcDd]")] + [InlineData("(?i)[acexyz]", "[AaCcEeXxYyZz]")] + [InlineData("(?i)\\w", "\\w")] + [InlineData("(?i)\\d", "\\d")] + [InlineData("(?i).", ".")] + [InlineData("(?i)\\$", "\\$")] public void PatternsReduceIdentically(string pattern1, string pattern2) { string result1 = GetRegexCodes(new Regex(pattern1)); @@ -394,10 +405,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) } Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2))); - if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:")) - { - Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2))); - } } [Theory] @@ -443,7 +450,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) // Not reducing branches of alternations with different casing [InlineData("(?i:abcd)|abcd", "abcd|abcd")] [InlineData("abcd|(?i:abcd)", "abcd|abcd")] - [InlineData("abc(?:(?i:e)|f)", "abc[ef]")] // Not applying auto-atomicity [InlineData("a*b*", "(?>a*)b*")] [InlineData("[ab]*[^a]", "(?>[ab]*)[^a]")]