From fdbffd26a4dd599059e4c44d1c053a1db65d73de Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Nov 2021 20:41:47 -0500 Subject: [PATCH 1/6] Factor out and improve the vectorization of RegexInterpreter.FindFirstChar This change started with the "simple" goal of factoring out the FindFirstChar logic from RegexInterpreter and consuming it in SymbolicRegexMatcher. The existing engines use FindFirstChar to quickly skip ahead to the next location that might possibly match, at which point they fall back to analyzing the whole pattern at that location. SymbolicRegexMatcher (used by RegexOptions.NonBacktracking) had its own implementation for this, which it used any time it entered a start state. This required non-trivial additional code to maintain, and there's no good reason it should be separate from the other engines. However, what started out as a simple change grew due to regressions that resulted from differences in the implementations. In particular, SymbolicRegexMatcher already works off of precomputed equivalence tables for casing, which gives it very different characteristics in this regard from the existing engines. For example, SymbolicRegexMatcher's existing "skip ahead to the next possible match start location" logic already evaluated all the characters that could possibly start a match, which included variations of the same character when using IgnoreCase, but the existing RegexInterpreter logic didn't. That discrepancy then results in a significant IgnoreCase regression for NonBacktracking due to losing the ability to use a vectorized search for the next starting location. We already plan to shift the existing engines over to a plan where all of these equivalences are computed at construction time rather than using ToLower at both construction time and match time, so this PR takes some steps in that direction, doing so for most of ASCII. 
This has added some temporary cruft, which we'll be able to delete once we fully shift the implementations over (which we should do in the near future). Another difference was that SymbolicRegexMatcher was enabling use of IndexOfAny for up to 5 characters, whereas RegexOptions.Compiled was only doing up to 3 characters, and RegexInterpreter wasn't doing so for any number. The PR now uses 5 everywhere. However, the more characters involved, the more overhead there is to IndexOfAny, and for some inputs, the higher the chances are that IndexOfAny will find a match sooner, which means its overhead compounds more. To help with that, we now not only compute the possible characters that might match at the beginning of the pattern, but also characters that might match at a fixed offset from the beginning of the pattern (e.g. in \d{3}-\d{2}-\d{4}, it will find the '-' at offset 3 and be able to vectorize a search for that and then back off by the relevant distance). That then also means we might end up with multiple sets to choose to search for, and this PR borrows an idea from Rust, which is to use some rough frequency analysis to determine which set should be targeted. It's not perfect, and we can update the texts used to seed the analysis (right now I based it primarily on *.cs files in dotnet/runtime and some Project Gutenberg texts), but it's good enough for these purposes for now. We'd previously switched to using IndexOf for a case-sensitive prefix string, but still were using Boyer-Moore for case-insensitive. Now that we're able to also vectorize a search for case-insensitive values (right now just ASCII letters, but that'll be fixed soon), we can just get rid of Boyer-Moore entirely. This saves all the costs to do with constructing the Boyer-Moore tables and also avoids having to generate the Boyer-Moore implementations in RegexOptions.Compiled and the source generator. The casing change also defeated some other optimizations already present. 
For example, in .NET 5 we added an optimization whereby an alternation like `abcef|abcgh` would be transformed into `abc(?:ef|gh)`, and that would apply whether case-sensitive or case-insensitive. But by transforming the expression at construction now for case-insensitive into `[Aa][Bb][Cc][Ee][Ff]|[Aa][Bb][Cc][Gg][Hh]`, that optimization was defeated. I've added a new optimization pass for alternations that will detect common prefixes even if they're sets. The casing change also revealed some cosmetic issues. As part of the change, when we encounter a "multi" (a multi-character string in the pattern), we convert that single case-insensitive RegexNode to instead be one case-sensitive RegexNode per character, with a set for all the equivalent characters that can match. This then defeats some of the nice formatting we had for multis in the source generator, so as part of this change, the source generator has been augmented to output nicer code for concatenations. And because sets like [Ee] are now way more common (since e.g. a case-insensitive 'e' will be transformed into such a set), we also special-case that in both the source generator and RegexOptions.Compiled, to spit out the equivalent of `(c | 0x20) == 'e'` rather than `(c == 'E' | c == 'e')`. Along the way, I cleaned up a few things as well, such as passing around a CultureInfo more often rather than repeatedly calling CultureInfo.CurrentCulture, using CollectionsMarshal.GetValueRefOrAddDefault on a hot path to do with interning strings in a lookup table, tweaking SymbolicRegexRunnerFactory's Runner to itself be generic to avoid an extra layer of virtual dispatch per operation, and cleaning up code / comments in SymbolicRegexMatcher. For the most part the purpose of the change wasn't to improve perf, and in fact I was willing to accept some regressions in the name of consolidation. 
There are a few regressions here, mostly small, and mostly for cases where we're simply paying an overhead for vectorization, e.g. where the current location is fine to match, or where the target character being searched for is very frequent. Overall, though, there are some substantial improvements. --- .../gen/RegexGenerator.Emitter.cs | 668 ++++--- .../gen/RegexGenerator.Parser.cs | 2 +- .../gen/Stubs.cs | 8 +- ...m.Text.RegularExpressions.Generator.csproj | 2 +- .../src/System.Text.RegularExpressions.csproj | 3 +- .../Text/RegularExpressions/Regex.Cache.cs | 2 +- .../System/Text/RegularExpressions/Regex.cs | 15 +- .../RegularExpressions/RegexBoyerMoore.cs | 404 ----- .../Text/RegularExpressions/RegexCharClass.cs | 350 +++- .../Text/RegularExpressions/RegexCode.cs | 45 +- .../Text/RegularExpressions/RegexCompiler.cs | 785 ++++---- .../RegexFindOptimizations.cs | 674 +++++++ .../RegularExpressions/RegexInterpreter.cs | 349 +--- .../RegularExpressions/RegexLWCGCompiler.cs | 3 - .../Text/RegularExpressions/RegexNode.cs | 219 ++- .../Text/RegularExpressions/RegexParser.cs | 161 +- .../RegularExpressions/RegexPrefixAnalyzer.cs | 674 +++++-- .../Text/RegularExpressions/RegexWriter.cs | 62 +- .../Symbolic/Algebras/BDD.cs | 1 + .../Symbolic/SymbolicRegexBuilder.cs | 2 - .../Symbolic/SymbolicRegexInfo.cs | 35 +- .../Symbolic/SymbolicRegexMatcher.cs | 418 ++--- .../Symbolic/SymbolicRegexRunnerFactory.cs | 49 +- .../Symbolic/SymbolicRegexSampler.cs | 14 +- .../Symbolic/Unicode/GeneratorHelper.cs | 2 + .../Unicode/IgnoreCaseRelationGenerator.cs | 2 + .../Unicode/UnicodeCategoryRangesGenerator.cs | 3 + .../src/System/Threading/StackHelper.cs | 16 + .../tests/Regex.Groups.Tests.cs | 1605 +++++++++-------- .../tests/Regex.Match.Tests.cs | 33 +- .../tests/Regex.Tests.Common.cs | 8 +- .../tests/RegexCultureTests.cs | 191 +- .../tests/RegexExperiment.cs | 112 +- .../tests/RegexReductionTests.cs | 15 +- 34 files changed, 3604 insertions(+), 3328 deletions(-) delete mode 100644 
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 734f3509f3843..835339bf8006e 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -231,10 +231,8 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, { RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses; bool rtl = code.RightToLeft; bool hasTextInfo = false; - bool textInfoEmitted = false; // Emit locals initialization writer.WriteLine("string runtext = base.runtext!;"); @@ -267,33 +265,62 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, }; using (EmitBlock(writer, clause)) { + // Some anchors help to advance the position but don't terminate the operation. + // As such, we do the anchors check first, and then treat them below the same + // as if there's no special searching enabled. EmitAnchors(); - if (code.BoyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm) - { - if (rbm.PatternSupportsIndexOf) - { - EmitIndexOf(rbm.Pattern); - } - else - { - EmitBoyerMoore(rbm); - } - } - else if (lcc is not null) - { - if (rtl) - { - EmitLeadingCharacter_RightToLeft(); - } - else - { - EmitLeadingCharacter_LeftToRight(); - } - } - else + // If whatever search operation we need to perform entails case-insensitive operations + // that weren't already handled via creation of sets, we need to get an store the + // TextInfo object to use (unless RegexOptions.CultureInvariant was specified). 
+ EmitTextInfo(writer, ref hasTextInfo, rm); + + // Emit the code for whatever find mode has been determined. + switch (code.FindOptimizations.FindMode) { - writer.WriteLine("return true;"); + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_LeftToRight(); + break; + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_RightToLeft(); + break; + + // Already emitted earlier + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + 
Debug.Assert(code.FindOptimizations.LeadingAnchor != 0); + goto case FindNextStartingPositionMode.NoSearch; + + default: + Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}"); + goto case FindNextStartingPositionMode.NoSearch; + + case FindNextStartingPositionMode.NoSearch: + writer.WriteLine("return true;"); + break; } } writer.WriteLine(); @@ -306,12 +333,10 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, void EmitAnchors() { // Generate anchor checks. - if ((code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) + if ((code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) { - // TODO: RegexInterpreter also factors in a Boyer-Moore prefix check in places Compiled just returns true. - // Determine if we should do so here and in Compiled as well, and potentially update RegexInterpreter. - // Interpreted and Compiled also differ in various places as to whether they update positions, as do LTR vs RTL. Determine why. - switch (code.LeadingAnchor) + // TODO: Interpreted and Compiled differ in various places as to whether they update positions, as do LTR vs RTL. Determine why. + switch (code.FindOptimizations.LeadingAnchor) { case RegexPrefixAnalyzer.Beginning: writer.WriteLine("// Beginning \\A anchor"); @@ -393,12 +418,12 @@ void EmitAnchors() writer.WriteLine("return true;"); return; - case RegexPrefixAnalyzer.Bol when !rtl: // Don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline + case RegexPrefixAnalyzer.Bol: // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike // other anchors like Beginning, there are potentially multiple places a BOL can match. 
So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. + // to boost our position to the next line, and then continue normally with any searches. + Debug.Assert(!rtl, "RightToLeft isn't implemented and should have been filtered out previously"); writer.WriteLine("// Beginning-of-line anchor"); using (EmitBlock(writer, "if (runtextpos > runtextbeg && runtext[runtextpos - 1] != '\\n')")) { @@ -415,219 +440,100 @@ void EmitAnchors() } } - void EmitBoyerMoore(RegexBoyerMoore rbm) + // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern. + void EmitIndexOf_LeftToRight(string prefix) { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); - - int beforefirst; - int last; - if (!rtl) - { - //limitLocal = "runtextend"; - beforefirst = -1; - last = rbm.Pattern.Length - 1; - } - else - { - //limitLocal = "runtextbeg"; - beforefirst = rbm.Pattern.Length; - last = 0; - } - - int chLast = rbm.Pattern[last]; - - EmitAdd(writer, "runtextpos", !rtl ? rbm.Pattern.Length - 1 : -rbm.Pattern.Length); - - using (EmitBlock(writer, $"while ({(!rtl ? "runtextpos < runtextend" : "runtextpos >= runtextbeg")})")) - { - writer.WriteLine($"ch = {ToLowerIfNeeded(hasTextInfo, options, "runtext[runtextpos]", rbm.CaseInsensitive)};"); - - using (EmitBlock(writer, $"if (ch != {Literal((char)chLast)})")) - { - writer.WriteLine($"ch -= {Literal((char)rbm.LowASCII)};"); - using (EmitBlock(writer, $"if ((uint)ch > ({Literal((char)rbm.HighASCII)} - {Literal((char)rbm.LowASCII)}))")) - { - EmitAdd(writer, "runtextpos", (!rtl ? 
rbm.Pattern.Length : -rbm.Pattern.Length)); - writer.WriteLine("continue;"); - } - - int negativeRange = rbm.HighASCII - rbm.LowASCII + 1; - if (negativeRange > 1) // High > Low - { - // Create a string to store the lookup table we use to find the offset. - // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate - // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again. - Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed."); - Span span = new char[negativeRange]; - for (int i = 0; i < span.Length; i++) - { - int offset = rbm.NegativeASCII[i + rbm.LowASCII]; - if (offset == beforefirst) - { - offset = rbm.Pattern.Length; - } - else if (rtl) - { - offset = -offset; - } - Debug.Assert(offset >= 0 && offset <= char.MaxValue); - span[i] = (char)offset; - } - - writer.WriteLine($"runtextpos {(rtl ? "-=" : "+=")} {Literal(span.ToString())}[ch];"); - } - else - { - Debug.Assert(negativeRange == 1); // High == Low - int offset = rbm.NegativeASCII[rbm.LowASCII]; - if (offset == beforefirst) - { - offset = rtl ? -rbm.Pattern.Length : rbm.Pattern.Length; - } - EmitAdd(writer, "runtextpos", offset); - } - writer.WriteLine("continue;"); - } - writer.WriteLine(); - writer.WriteLine("int test = runtextpos;"); - writer.WriteLine(); - - for (int i = rbm.Pattern.Length - 2; i >= 0; i--) - { - int charIndex = !rtl ? i : rbm.Pattern.Length - 1 - i; - bool sameAsPrev = i < rbm.Pattern.Length - 2 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i + 1 : rbm.Pattern.Length - 1 - (i + 1)]; - bool sameAsNext = i > 0 && rbm.Positive[charIndex] == rbm.Positive[!rtl ? i - 1 : rbm.Pattern.Length - 1 - (i - 1)]; - - string condition = $"{ToLowerIfNeeded(hasTextInfo, options, (!rtl ? 
"runtext[--test]" : "runtext[++test]"), rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charIndex]))} != {Literal(rbm.Pattern[charIndex])}"; - switch ((sameAsPrev, sameAsNext)) - { - case (true, true): - writer.WriteLine($" {condition} ||"); - break; - - case (false, true): - writer.WriteLine($"if ({condition} ||"); - break; - - case (true, false): - writer.WriteLine($" {condition})"); - using (EmitBlock(writer, null)) - { - EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]); - writer.WriteLine("continue;"); - } - writer.WriteLine(); - break; - - case (false, false): - using (EmitBlock(writer, $"if ({condition})")) - { - EmitAdd(writer, "runtextpos", rbm.Positive[charIndex]); - writer.WriteLine("continue;"); - } - writer.WriteLine(); - break; - } - } - - writer.WriteLine(!rtl ? - "base.runtextpos = test;" : - "base.runtextpos = test + 1;"); - writer.WriteLine("return true;"); - } + writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos), {Literal(prefix)});"); + writer.WriteLine("if (i >= 0)"); + writer.WriteLine("{"); + writer.WriteLine(" base.runtextpos = runtextpos + i;"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); } - void EmitIndexOf(string prefix) + // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. 
+ void EmitIndexOf_RightToLeft(string prefix) { - writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos), {Literal(prefix)});"); + writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(prefix)});"); writer.WriteLine("if (i >= 0)"); writer.WriteLine("{"); - writer.WriteLine(" base.runtextpos = runtextpos + i;"); + writer.WriteLine($" base.runtextpos = runtextbeg + i + {prefix.Length};"); writer.WriteLine(" return true;"); writer.WriteLine("}"); } - void EmitLeadingCharacter_RightToLeft() + // Emits a right-to-left search for a set at a fixed position from the start of the pattern. + // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) + void EmitFixedSet_RightToLeft() { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); + (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) set = code.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); - Debug.Assert(lcc.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft"); - string set = lcc[0].CharClass; - if (RegexCharClass.IsSingleton(set)) + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) { - char ch = RegexCharClass.SingletonChar(set); - using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)")) - { - using (EmitBlock(writer, $"if (runtext[i] == {ToLowerIfNeeded(hasTextInfo, options, Literal(ch), lcc[0].CaseInsensitive)})")) - { - writer.WriteLine("base.runtextpos = i + 1;"); - writer.WriteLine("return true;"); - } - } + writer.WriteLine($"int i = global::System.MemoryExtensions.LastIndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextbeg, runtextpos - runtextbeg), {Literal(set.Chars[0])});"); + writer.WriteLine("if (i >= 0)"); + writer.WriteLine("{"); + writer.WriteLine(" base.runtextpos = runtextbeg + i + 1;"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); } else { using (EmitBlock(writer, "for (int i = runtextpos - 1; i >= runtextbeg; i--)")) { - using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set, lcc[0].CaseInsensitive)})")) + using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "runtext[i]", set.Set, set.CaseInsensitive)})")) { - writer.WriteLine("runtextpos = i + 1;"); + writer.WriteLine("base.runtextpos = i + 1;"); writer.WriteLine("return true;"); } } } } - void EmitLeadingCharacter_LeftToRight() + // Emits a left-to-right search for a set at a fixed position from the start of the pattern, + // and potentially other sets at other fixed positions in the pattern. + void EmitFixedSet_LeftToRight() { - Debug.Assert(lcc is not null && lcc.Length > 0); - - // If minRequiredLength > 0, we already output a more stringent check. 
In the rare case - // where we were unable to get an accurate enough min required length to ensure it's larger - // than the prefixes we calculated, we also need to ensure we have enough space for those, - // as they also represent a min required length. - if (minRequiredLength < lcc.Length) - { - writer.WriteLine($"// Validate at least {lcc.Length} characters are available to match"); - string endExpr = lcc.Length > 1 ? $"runtextend - {lcc.Length - 1}" : "runtextend"; - using (EmitBlock(writer, $"if (runtextpos >= {endExpr})")) - { - writer.WriteLine("goto ReturnFalse;"); - } - writer.WriteLine(); - } - - writer.WriteLine("global::System.ReadOnlySpan span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);"); + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets; + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; + const int MaxSets = 4; + int setsToUse = Math.Min(sets.Count, MaxSets); // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. - Span setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below - int setCharsCount = 0, charClassIndex = 0; - bool canUseIndexOf = - !lcc[0].CaseInsensitive && - (setCharsCount = RegexCharClass.GetSetChars(lcc[0].CharClass, setChars)) > 0 && - !RegexCharClass.IsNegated(lcc[0].CharClass); - bool needLoop = !canUseIndexOf || lcc.Length > 1; + int setIndex = 0; + bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null; + bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitScope loopBlock = default; if (needLoop) { - EmitTextInfoIfRequired(writer, ref textInfoEmitted, ref hasTextInfo, rm); - writer.WriteLine(); - string upperBound = lcc.Length > 1 ? 
$"span.Length - {lcc.Length - 1}" : "span.Length"; + writer.WriteLine("global::System.ReadOnlySpan span = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);"); + string upperBound = "span.Length" + (setsToUse > 1 || primarySet.Distance != 0 ? $" - {minRequiredLength - 1}" : ""); loopBlock = EmitBlock(writer, $"for (int i = 0; i < {upperBound}; i++)"); } if (canUseIndexOf) { - charClassIndex = 1; + string span = needLoop ? + "span" : + "global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos)"; - string span = needLoop ? "span.Slice(i)" : "span"; - string indexOf = setCharsCount switch + span = (needLoop, primarySet.Distance) switch { - 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(setChars[0])})", - 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])})", - _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", + (false, 0) => span, + (true, 0) => $"{span}.Slice(i)", + (false, _) => $"{span}.Slice({primarySet.Distance})", + (true, _) => $"{span}.Slice(i + {primarySet.Distance})", + }; + + string indexOf = primarySet.Chars!.Length switch + { + 1 => $"global::System.MemoryExtensions.IndexOf({span}, {Literal(primarySet.Chars[0])})", + 2 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", + 3 => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", + _ => $"global::System.MemoryExtensions.IndexOfAny({span}, {Literal(new string(primarySet.Chars))})", }; if (needLoop) @@ -640,60 +546,93 @@ void EmitLeadingCharacter_LeftToRight() writer.WriteLine("i += indexOfPos;"); writer.WriteLine(); - if (lcc.Length > 1) + if (setsToUse > 1) { - using (EmitBlock(writer, $"if (i >= span.Length - {lcc.Length - 1})")) 
+ using (EmitBlock(writer, $"if (i >= span.Length - {minRequiredLength - 1})")) { writer.WriteLine("goto ReturnFalse;"); } + writer.WriteLine(); } } else { writer.WriteLine($"int i = {indexOf};"); - using (EmitBlock(writer, "if (i < 0)")) + using (EmitBlock(writer, "if (i >= 0)")) { - writer.WriteLine("goto ReturnFalse;"); + writer.WriteLine("base.runtextpos = runtextpos + i;"); + writer.WriteLine("return true;"); } } - writer.WriteLine(); + + setIndex = 1; } - Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - bool hasCharClassConditions = false; - if (charClassIndex < lcc.Length) + if (needLoop) { - // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") && - // ...) - Debug.Assert(needLoop); - int start = charClassIndex; - for (; charClassIndex < lcc.Length; charClassIndex++) + Debug.Assert(setIndex == 0 || setIndex == 1); + bool hasCharClassConditions = false; + if (setIndex < setsToUse) { - string spanIndex = charClassIndex > 0 ? $"span[i + {charClassIndex}]" : "span[i]"; - string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, lcc[charClassIndex].CharClass, lcc[charClassIndex].CaseInsensitive); - - if (charClassIndex == start) + // if (CharInClass(textSpan[i + charClassIndex], prefix[0], "...") && + // ...) + Debug.Assert(needLoop); + int start = setIndex; + for (; setIndex < setsToUse; setIndex++) { - writer.Write($"if ({charInClassExpr}"); - } - else - { - writer.WriteLine(" &&"); - writer.Write($" {charInClassExpr}"); + string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? 
$" + {sets[setIndex].Distance}" : "")}]"; + string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive); + + if (setIndex == start) + { + writer.Write($"if ({charInClassExpr}"); + } + else + { + writer.WriteLine(" &&"); + writer.Write($" {charInClassExpr}"); + } } + writer.WriteLine(")"); + hasCharClassConditions = true; } - writer.WriteLine(")"); - hasCharClassConditions = true; - } - using (hasCharClassConditions ? EmitBlock(writer, null) : default) - { - writer.WriteLine("base.runtextpos = runtextpos + i;"); - writer.WriteLine("return true;"); + using (hasCharClassConditions ? EmitBlock(writer, null) : default) + { + writer.WriteLine("base.runtextpos = runtextpos + i;"); + writer.WriteLine("return true;"); + } } loopBlock.Dispose(); } + + // If a TextInfo is needed to perform ToLower operations, emits a local initialized to the TextInfo to use. + static void EmitTextInfo(IndentedTextWriter writer, ref bool hasTextInfo, RegexMethod rm) + { + // Emit local to store current culture if needed + if ((rm.Options & RegexOptions.CultureInvariant) == 0) + { + bool needsCulture = rm.Code.FindOptimizations.FindMode switch + { + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or + FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + + _ when rm.Code.FindOptimizations.FixedDistanceSets is List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + + _ => false, + }; + + if (needsCulture) + { + hasTextInfo = true; + writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); + } + } + } } /// Emits the body of the Go override. @@ -750,9 +689,12 @@ private static void EmitNonBacktrackingGo(IndentedTextWriter writer, RegexMethod /// Emits the body of a simplified Go implementation that's possible when there's minimal backtracking required by the expression. private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, string id) { + // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated + // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. + const int MaxUnrollSize = 16; + RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - (string CharClass, bool CaseInsensitive)[]? lcc = code.LeadingCharClasses; bool rtl = code.RightToLeft; bool hasTimeout = false; @@ -1267,26 +1209,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.Concatenate: - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) - { - EmitSpanLengthCheck(requiredLength); - writer.WriteLine(); - - for (; i < exclusiveEnd; i++) - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); - } - - i--; - } - else - { - EmitNode(node.Child(i), i + 1 < childCount ? 
node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); - } - } + EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Capture: @@ -1360,8 +1283,91 @@ void EmitUpdateBumpalong() writer.WriteLine("base.runtextpos = runtextpos;"); } + void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) + { + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + { + bool wroteClauses = true; + writer.Write($"if ({SpanLengthCheck(requiredLength)}"); + + while (i < exclusiveEnd) + { + for (; i < exclusiveEnd; i++) + { + void WriteSingleCharChild(RegexNode child) + { + if (wroteClauses) + { + writer.WriteLine(" ||"); + writer.Write(" "); + } + else + { + writer.Write("if ("); + } + EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); + wroteClauses = true; + } + + RegexNode child = node.Child(i); + if (child.Type is RegexNode.One or RegexNode.Notone or RegexNode.Set) + { + WriteSingleCharChild(child); + writer.Write($" /* {DescribeNode(child)} */"); + } + else if (child.Type is RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or + RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic or + RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic && + child.M == child.N && + child.M <= MaxUnrollSize) + { + for (int c = 0; c < child.M; c++) + { + WriteSingleCharChild(child); + if (c == 0) + { + writer.Write($" /* {DescribeNode(child)} */"); + } + } + } + else + { + break; + } + } + + if (wroteClauses) + { + writer.WriteLine(")"); + using (EmitBlock(writer, null)) + { + writer.WriteLine($"goto {doneLabel};"); + } + wroteClauses = false; + } + + if (i < exclusiveEnd) + { + writer.WriteLine(); + EmitNode(node.Child(i), i + 1 < childCount ? 
node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); + i++; + } + } + + i--; + } + else + { + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); + } + } + } + // Emits the code to handle a single-character match. - void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null) + void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset = null, bool clauseOnly = false) { // This only emits a single check, but it's called from the looping constructs in a loop // to generate the code for a single check, so we map those looping constructs to the @@ -1375,13 +1381,20 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? "!=" : "==")} {Literal(node.Ch)}"; } - using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) + if (clauseOnly) { - writer.WriteLine($"goto {doneLabel};"); + writer.Write(expr); + } + else + { + using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) + { + writer.WriteLine($"goto {doneLabel};"); + } } textSpanPos++; @@ -1685,10 +1698,6 @@ void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) EmitSpanLengthCheck(iterations); } - // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated - // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. 
- const int MaxUnrollSize = 16; - if (iterations <= MaxUnrollSize) { // if (textSpan[textSpanPos] != c1 || @@ -1771,13 +1780,13 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = int minIterations = node.M; int maxIterations = node.N; - Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny + Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today int numSetChars = 0; string iterationLocal = NextLocalName("i"); if (node.IsNotoneFamily && maxIterations == int.MaxValue && - (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) + (!IsCaseInsensitive(node))) { // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, @@ -1802,21 +1811,25 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 && RegexCharClass.IsNegated(node.Str!)) { - // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would + // If the set is negated and contains only a few characters (if it contained 1 and was negated, it should // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. + Debug.Assert(numSetChars > 1); writer.Write($"int {iterationLocal} = global::System.MemoryExtensions.IndexOfAny({textSpanLocal}"); if (textSpanPos != 0) { writer.Write($".Slice({textSpanPos})"); } - writer.WriteLine(numSetChars == 2 ? 
- $", {Literal(setChars[0])}, {Literal(setChars[1])});" : - $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});"); + writer.WriteLine(numSetChars switch + { + 2 => $", {Literal(setChars[0])}, {Literal(setChars[1])});", + 3 => $", {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", + _ => $", {Literal(setChars.Slice(0, numSetChars).ToString())});", + }); using (EmitBlock(writer, $"if ({iterationLocal} == -1)")) { writer.WriteLine(textSpanPos > 0 ? @@ -1844,7 +1857,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } @@ -1895,7 +1908,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } @@ -2538,7 +2551,7 @@ void EmitOneCode(string? label) clause += Code() == RegexCode.Set ? $"!{MatchCharacterClass(hasTextInfo, options, expr, rm.Code.Strings[Operand(0)], IsCaseInsensitive())}" : - $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0)))} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}"; + $"{ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive())} {(Code() == RegexCode.One ? "!=" : "==")} {Operand(0)}"; using (EmitBlock(writer, $"if ({clause})")) { @@ -2555,7 +2568,7 @@ void EmitOneCode(string? 
label) writer.WriteLine($"if (runtextend - runtextpos < {str.Length} ||"); for (int i = 0; i < str.Length; i++) { - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}"); + writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos{(i == 0 ? "" : $" + {i}")}]", IsCaseInsensitive())} != {Literal(str[i])}"); writer.WriteLine(i < str.Length - 1 ? " ||" : ")"); } using (EmitBlock(writer, null)) @@ -2575,7 +2588,7 @@ void EmitOneCode(string? label) for (int i = str.Length; i > 0;) { i--; - writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i]))} != {Literal(str[i])}"); + writer.Write($" {ToLowerIfNeeded(hasTextInfo, options, $"runtext[runtextpos - {str.Length - i}]", IsCaseInsensitive())} != {Literal(str[i])}"); writer.WriteLine(i == 0 ? ")" : " ||"); } using (EmitBlock(writer, null)) @@ -2661,7 +2674,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {(Code() == RegexCode.Onerep ? "!=" : "==")} {Literal((char)Operand(0))}"; } @@ -2708,7 +2721,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); string op = Code() == RegexCode.Onerep ? "!=" : "=="; using (EmitBlock(writer, $"if ({expr} {op} {Literal((char)Operand(0))})")) { @@ -2769,14 +2782,14 @@ void EmitOneCode(string? label) } string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? 
rm.Code.Strings[Operand(0)] : null; - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars; // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, // we can use the vectorized IndexOf to search for the target character. if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && !IsRightToLeft() && - (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0)))) + !IsCaseInsensitive()) { writer.WriteLine($"{I} = global::System.MemoryExtensions.IndexOf(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal((char)Operand(0))}); // i"); using (EmitBlock(writer, $"if ({I} == -1)")) @@ -2793,20 +2806,19 @@ void EmitOneCode(string? label) else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && !IsRightToLeft() && !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && RegexCharClass.IsNegated(set!)) { // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny + // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny // to search for those chars. 
- - Debug.Assert(numSetChars is 2 or 3); - writer.Write($"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}"); - if (numSetChars == 3) + Debug.Assert(numSetChars > 1); + writer.WriteLine(numSetChars switch { - writer.Write($", {Literal(setChars[2])}"); - } - writer.WriteLine("); // i"); + 2 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}); // i", + 3 => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])}); // i", + _ => $"{I} = global::System.MemoryExtensions.IndexOfAny(global::System.MemoryExtensions.AsSpan(runtext, runtextpos, {Len}), {Literal(setChars.Slice(0, numSetChars).ToString())}); // i", + }); using (EmitBlock(writer, $"if ({I} == -1)")) { writer.WriteLine($"runtextpos += {Len};"); @@ -2846,7 +2858,7 @@ void EmitOneCode(string? label) else { string op = Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic ? "!=" : "=="; - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {op} {Literal((char)Operand(0))}"; } @@ -2975,7 +2987,7 @@ void EmitOneCode(string? label) } else { - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))); + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive()); expr = $"{expr} {(Code() == RegexCode.Onelazy ? 
"!=" : "==")} {Literal((char)Operand(0))}"; } @@ -3223,40 +3235,6 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression; - private static void EmitTextInfoIfRequired(IndentedTextWriter writer, ref bool textInfoEmitted, ref bool hasTextInfo, RegexMethod rm) - { - if (textInfoEmitted) - { - return; - } - textInfoEmitted = true; - - // Emit local to store current culture if needed - if ((((RegexOptions)rm.Options) & RegexOptions.CultureInvariant) == 0) - { - bool needsCulture = (((RegexOptions)rm.Options) & RegexOptions.IgnoreCase) != 0 || rm.Code.BoyerMoorePrefix?.CaseInsensitive == true; - if (!needsCulture && rm.Code.LeadingCharClasses is not null) - { - for (int i = 0; i < rm.Code.LeadingCharClasses.Length; i++) - { - if (rm.Code.LeadingCharClasses[i].CaseInsensitive) - { - needsCulture = true; - break; - } - } - } - - if (needsCulture) - { - hasTextInfo = true; - writer.WriteLine("// IgnoreCase with CultureInfo.CurrentCulture"); - writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); - writer.WriteLine(); - } - } - } - private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive) { // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), @@ -3319,22 +3297,32 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options return $"(char.GetUnicodeCategory({chExpr}) {(negated ? "!=" : "==")} global::System.Globalization.UnicodeCategory.{category})"; } - // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), - // it's cheaper and smaller to compare against each than it is to use a lookup table. 
- if (!invariant) + // Next, if there's only 2, 3, or 4 chars in the set (fairly common due to the sets we create for prefixes), + // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case + // the very common case with case insensitivity of two characters next to each other being the upper and lowercase + // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison. + if (!invariant && !RegexCharClass.IsNegated(charClass)) { - Span setChars = stackalloc char[3]; - int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[4]; + switch (RegexCharClass.GetSetChars(charClass, setChars)) { - switch (numChars) - { - case 2: - return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; + case 2: + return (setChars[0] | 0x20) == setChars[1] ? + $"(({chExpr} | 0x20) == {Literal(setChars[1])})" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; - case 3: - return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))"; - } + case 3: + return (setChars[0] | 0x20) == setChars[1] ? 
+ $"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))"; + + case 4: + if (((setChars[0] | 0x20) == setChars[1]) && + ((setChars[2] | 0x20) == setChars[3])) + { + return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))"; + } + break; } } diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs index 045dbd5467968..cb3bed4d27fa2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs @@ -178,7 +178,7 @@ private static bool IsSyntaxTargetForGeneration(SyntaxNode node) => RegexCode code; try { - code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture)); + code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture), culture); } catch (Exception e) { diff --git a/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs b/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs index aefab1dc1b057..13626a4be5a3b 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/Stubs.cs @@ -54,13 +54,13 @@ namespace System.Threading { internal static class InterlockedExtensions { - public static int Or(ref int location1, int value) + public static uint Or(ref uint location1, uint value) { - int current = location1; + uint current = location1; while (true) { - int newValue = current | value; - int oldValue = Interlocked.CompareExchange(ref location1, newValue, current); + uint newValue = current | value; + uint oldValue = (uint)Interlocked.CompareExchange(ref Unsafe.As(ref location1), (int)newValue, (int)current); if (oldValue == current) { return oldValue; diff --git 
a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index 7f59e37493cd8..8e1ec70d99d6c 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -30,10 +30,10 @@ - + diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index da0f0edd7c0fa..8537fd70de527 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -23,11 +23,11 @@ - + @@ -100,6 +100,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs index edcb23b1cdba9..66b1a8108c4ab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs @@ -128,7 +128,7 @@ public static Regex GetOrAdd(string pattern, RegexOptions options, TimeSpan matc Regex.ValidateOptions(options); Regex.ValidateMatchTimeout(matchTimeout); - CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + CultureInfo culture = RegexParser.GetTargetCulture(options); Key key = new Key(pattern, culture.ToString(), options, matchTimeout); Regex? 
regex = Get(key); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 85d575ccaf193..ee276b33deb75 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -65,12 +65,12 @@ internal Regex(string pattern, CultureInfo? culture) // Call Init directly rather than delegating to a Regex ctor that takes // options to enable linking / tree shaking to remove the Regex compiler // and NonBacktracking implementation if it's not used. - Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture); + Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture ?? CultureInfo.CurrentCulture); } internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture) { - culture ??= GetTargetCulture(options); + culture ??= RegexParser.GetTargetCulture(options); Init(pattern, options, matchTimeout, culture); if ((options & RegexOptions.NonBacktracking) != 0) @@ -87,10 +87,6 @@ internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, Cult } } - /// Gets the culture to use based on the specified options. - private static CultureInfo GetTargetCulture(RegexOptions options) => - (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - /// Initializes the instance. /// /// This is separated out of the constructor so that an app only using 'new Regex(pattern)' @@ -98,7 +94,7 @@ private static CultureInfo GetTargetCulture(RegexOptions options) => /// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used. /// [MemberNotNull(nameof(_code))] - private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? 
culture) + private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) { ValidatePattern(pattern); ValidateOptions(options); @@ -107,7 +103,6 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C this.pattern = pattern; internalMatchTimeout = matchTimeout; roptions = options; - culture ??= GetTargetCulture(options); #if DEBUG if (IsDebug) @@ -121,7 +116,7 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C // Generate the RegexCode from the node tree. This is required for interpreting, // and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking. - _code = RegexWriter.Write(tree); + _code = RegexWriter.Write(tree, culture); if ((options & RegexOptions.NonBacktracking) != 0) { @@ -434,7 +429,7 @@ internal void Run(string input, int startat, ref TState state, MatchCall /// Creates a new runner instance. private RegexRunner CreateRunner() => factory?.CreateInstance() ?? - new RegexInterpreter(_code!, GetTargetCulture(roptions)); + new RegexInterpreter(_code!, RegexParser.GetTargetCulture(roptions)); /// True if the option was set. protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs deleted file mode 100644 index 7fc3fb1edf3ba..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs +++ /dev/null @@ -1,404 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -// The RegexBoyerMoore object precomputes the Boyer-Moore -// tables for fast string scanning. 
These tables allow -// you to scan for the first occurrence of a string within -// a large body of text without examining every character. -// The performance of the heuristic depends on the actual -// string and the text being searched, but usually, the longer -// the string that is being searched for, the fewer characters -// need to be examined. - -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.Globalization; - -namespace System.Text.RegularExpressions -{ - internal sealed class RegexBoyerMoore - { - public readonly int[] Positive; - public readonly int[] NegativeASCII; - public readonly int[][]? NegativeUnicode; - public readonly string Pattern; - public readonly int LowASCII; - public readonly int HighASCII; - public readonly bool RightToLeft; - public readonly bool CaseInsensitive; - private readonly CultureInfo _culture; - - /// The maximum prefix string length for which we'll attempt to create a Boyer-Moore table. - /// This is limited in order to minimize the overhead of constructing a Regex. - public const int MaxLimit = 50_000; // must be <= char.MaxValue for RegexCompiler to compile Boyer-Moore correctly - - /// - /// Constructs a Boyer-Moore state machine for searching for the string - /// pattern. The string must not be zero-length. - /// - public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) - { - // Sorry, you just can't use Boyer-Moore to find an empty pattern. - // We're doing this for your own protection. (Really, for speed.) - Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); - Debug.Assert(pattern.Length <= MaxLimit, "RegexBoyerMoore can take a long time for large patterns"); -#if DEBUG - if (caseInsensitive) - { - foreach (char c in pattern) - { - // We expect each individual character to have been lower-cased. 
We don't validate the whole - // string at once because the rest of the library doesn't currently recognize/support surrogate pairs. - Debug.Assert(c == culture.TextInfo.ToLower(c), "Pattern wasn't lowercased with provided culture"); - } - } -#endif - - Pattern = pattern; - RightToLeft = rightToLeft; - CaseInsensitive = caseInsensitive; - _culture = culture; - - int beforefirst; - int last; - int bump; - - if (!rightToLeft) - { - beforefirst = -1; - last = pattern.Length - 1; - bump = 1; - } - else - { - beforefirst = pattern.Length; - last = 0; - bump = -1; - } - - // PART I - the good-suffix shift table - // - // compute the positive requirement: - // if char "i" is the first one from the right that doesn't match, - // then we know the matcher can advance by _positive[i]. - // - // This algorithm is a simplified variant of the standard - // Boyer-Moore good suffix calculation. - - Positive = new int[pattern.Length]; - - int examine = last; - char ch = pattern[examine]; - Positive[examine] = bump; - examine -= bump; - int scan; - int match; - - while (true) - { - // find an internal char (examine) that matches the tail - - while (true) - { - if (examine == beforefirst) - goto OuterloopBreak; - if (pattern[examine] == ch) - break; - examine -= bump; - } - - match = last; - scan = examine; - - // find the length of the match - - while (true) - { - if (scan == beforefirst || pattern[match] != pattern[scan]) - { - // at the end of the match, note the difference in _positive - // this is not the length of the match, but the distance from the internal match - // to the tail suffix. 
- if (Positive[match] == 0) - Positive[match] = match - scan; - - break; - } - - scan -= bump; - match -= bump; - } - - examine -= bump; - } - - OuterloopBreak: - - match = last - bump; - - // scan for the chars for which there are no shifts that yield a different candidate - - - // The inside of the if statement used to say - // "_positive[match] = last - beforefirst;" - // This is slightly less aggressive in how much we skip, but at worst it - // should mean a little more work rather than skipping a potential match. - while (match != beforefirst) - { - if (Positive[match] == 0) - Positive[match] = bump; - - match -= bump; - } - - // PART II - the bad-character shift table - // - // compute the negative requirement: - // if char "ch" is the reject character when testing position "i", - // we can slide up by _negative[ch]; - // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) - // - // the lookup table is divided into ASCII and Unicode portions; - // only those parts of the Unicode 16-bit code set that actually - // appear in the string are in the table. (Maximum size with - // Unicode is 65K; ASCII only case is 512 bytes.) 
- - NegativeASCII = new int[128]; - - for (int i = 0; i < 128; i++) - NegativeASCII[i] = last - beforefirst; - - LowASCII = 127; - HighASCII = 0; - - for (examine = last; examine != beforefirst; examine -= bump) - { - ch = pattern[examine]; - - if (ch < 128) - { - if (LowASCII > ch) - LowASCII = ch; - - if (HighASCII < ch) - HighASCII = ch; - - if (NegativeASCII[ch] == last - beforefirst) - NegativeASCII[ch] = last - examine; - } - else - { - int i = ch >> 8; - int j = ch & 0xFF; - - if (NegativeUnicode == null) - { - NegativeUnicode = new int[256][]; - } - - if (NegativeUnicode[i] == null) - { - int[] newarray = new int[256]; - - for (int k = 0; k < newarray.Length; k++) - newarray[k] = last - beforefirst; - - if (i == 0) - { - Array.Copy(NegativeASCII, newarray, 128); - NegativeASCII = newarray; - } - - NegativeUnicode[i] = newarray; - } - - if (NegativeUnicode[i][j] == last - beforefirst) - NegativeUnicode[i][j] = last - examine; - } - } - } - - // TODO: We should be able to avoid producing the RegexBoyerMoore instance - // entirely if we're going to go down the code path of using IndexOf. That will - // require some refactoring, though. - - /// Gets whether IndexOf could be used to perform the match. 
- public bool PatternSupportsIndexOf => - !RightToLeft && (!CaseInsensitive || !RegexCharClass.ParticipatesInCaseConversion(Pattern)); - - /// - /// When a regex is anchored, we can do a quick IsMatch test instead of a Scan - /// - public bool IsMatch(string text, int index, int beglimit, int endlimit) - { - if (!RightToLeft) - { - if (index < beglimit || endlimit - index < Pattern.Length) - return false; - } - else - { - if (index > endlimit || index - beglimit < Pattern.Length) - return false; - - index -= Pattern.Length; - } - - if (CaseInsensitive) - { - TextInfo textinfo = _culture.TextInfo; - - for (int i = 0; i < Pattern.Length; i++) - { - if (Pattern[i] != textinfo.ToLower(text[index + i])) - { - return false; - } - } - - return true; - } - - return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length)); - } - - /// - /// Scan uses the Boyer-Moore algorithm to find the first occurrence - /// of the specified string within text, beginning at index, and - /// constrained within beglimit and endlimit. - /// - /// The direction and case-sensitivity of the match is determined - /// by the arguments to the RegexBoyerMoore constructor. 
- /// - public int Scan(string text, int index, int beglimit, int endlimit) - { - int defadv; - int test; - int startmatch; - int endmatch; - int bump; - - if (!RightToLeft) - { - defadv = Pattern.Length; - startmatch = Pattern.Length - 1; - endmatch = 0; - test = index + defadv - 1; - bump = 1; - } - else - { - defadv = -Pattern.Length; - startmatch = 0; - endmatch = -defadv - 1; - test = index + defadv; - bump = -1; - } - - char chMatch = Pattern[startmatch]; - char chTest; - int test2; - int match; - int advance; - int[] unicodeLookup; - - while (true) - { - if (test >= endlimit || test < beglimit) - return -1; - - chTest = text[test]; - - if (CaseInsensitive) - chTest = _culture.TextInfo.ToLower(chTest); - - if (chTest != chMatch) - { - if (chTest < 128) - advance = NegativeASCII[chTest]; - else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8]))) - advance = unicodeLookup[chTest & 0xFF]; - else - advance = defadv; - - test += advance; - } - else - { // if (chTest == chMatch) - test2 = test; - match = startmatch; - - while (true) - { - if (match == endmatch) - return (RightToLeft ? test2 + 1 : test2); - - match -= bump; - test2 -= bump; - - chTest = text[test2]; - - if (CaseInsensitive) - chTest = _culture.TextInfo.ToLower(chTest); - - if (chTest != Pattern[match]) - { - advance = Positive[match]; - if ((chTest & 0xFF80) == 0) - test2 = (match - startmatch) + NegativeASCII[chTest]; - else if (null != NegativeUnicode && (null != (unicodeLookup = NegativeUnicode[chTest >> 8]))) - test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; - else - { - test += advance; - break; - } - - if (RightToLeft ? test2 < advance : test2 > advance) - advance = test2; - - test += advance; - break; - } - } - } - } - } - -#if DEBUG - /// Used when dumping for debugging. 
- [ExcludeFromCodeCoverage] - public override string ToString() => Dump(string.Empty); - - [ExcludeFromCodeCoverage] - public string Dump(string indent) - { - var sb = new StringBuilder(); - - sb.AppendLine($"{indent}BM Pattern: {Pattern}"); - - sb.Append($"{indent}Positive: "); - foreach (int i in Positive) - { - sb.Append($"{i} "); - } - sb.AppendLine(); - - if (NegativeASCII != null) - { - sb.Append($"{indent}Negative table: "); - for (int i = 0; i < NegativeASCII.Length; i++) - { - if (NegativeASCII[i] != Pattern.Length) - { - sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}"); - } - } - } - sb.AppendLine(); - - return sb.ToString(); - } -#endif - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 1c2cf0ff65817..c60ea9b04de15 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; +using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions @@ -71,6 +72,7 @@ internal sealed partial class RegexCharClass internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; internal const string AnyClass = "\x00\x01\x00\x00"; + private const string EmptyClass = "\x00\x00\x00"; // UnicodeCategory is zero based, so we add one to each value and subtract it off later private const int DefinedCategoriesCapacity = 38; @@ -876,7 +878,7 @@ public static bool ParticipatesInCaseConversion(int comparison) /// Gets whether the specified string participates in case conversion. /// The string participates in case conversion if any of its characters do. 
- public static bool ParticipatesInCaseConversion(string s) + public static bool ParticipatesInCaseConversion(ReadOnlySpan s) { foreach (char c in s) { @@ -890,6 +892,7 @@ public static bool ParticipatesInCaseConversion(string s) } /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. + /// This may enumerate negated characters if the set is negated. private static bool CanEasilyEnumerateSetContents(string set) => set.Length > SetStartIndex && set[SetLengthIndex] > 0 && @@ -1013,61 +1016,69 @@ public static bool IsWordChar(char ch) } } - public static bool CharInClass(char ch, string set, ref int[]? asciiResultCache) + /// Determines a character's membership in a character class (via the string representation of the class). + /// The character. + /// The string representation of the character class. + /// A lazily-populated cache for ASCII results stored in a 256-bit array. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache) { - // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair - // says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never + // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit in the pair + // says whether the second bit in the pair has already been computed. Once a value is computed, it's never // changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit // has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result. 
const int CacheArrayLength = 8; - Debug.Assert(asciiResultCache is null || asciiResultCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters"); + Debug.Assert(asciiLazyCache is null || asciiLazyCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters"); - if (ch < 128) + // If the value is ASCII and already has an answer for this value, use it. + if (asciiLazyCache is uint[] cache) { - // Lazily-initialize the cache for this set. - if (asciiResultCache is null) + int index = ch >> 4; + if ((uint)index < (uint)cache.Length) { - Interlocked.CompareExchange(ref asciiResultCache, new int[CacheArrayLength], null); + Debug.Assert(ch < 128); + uint current = cache[index]; + uint bit = 1u << ((ch & 0xF) << 1); + if ((current & bit) != 0) + { + return (current & (bit << 1)) != 0; + } } + } - // Determine which int in the lookup array contains the known and value bits for this character, - // and compute their bit numbers. - ref int slot = ref asciiResultCache[ch >> 4]; - int knownBit = 1 << ((ch & 0xF) << 1); - int valueBit = knownBit << 1; - - // If the value for this bit has already been computed, use it. - int current = slot; - if ((current & knownBit) != 0) - { - return (current & valueBit) != 0; - } + // For ASCII, lazily initialize. For non-ASCII, just compute the value. + return ch < 128 ? + InitializeValue(ch, set, ref asciiLazyCache) : + CharInClassRecursive(ch, set, 0); + static bool InitializeValue(char ch, string set, ref uint[]? asciiLazyCache) + { // (After warm-up, we should find ourselves rarely getting here.) + Debug.Assert(ch < 128); - // Otherwise, compute it normally. + // Compute the result and determine which bits to write back to the array and "or" the bits back in a thread-safe manner. bool isInClass = CharInClass(ch, set); - - // Determine which bits to write back to the array and "or" the bits back in a thread-safe manner. 
- int bitsToSet = knownBit; + uint bitsToSet = 1u << ((ch & 0xF) << 1); if (isInClass) { - bitsToSet |= valueBit; + bitsToSet |= bitsToSet << 1; } + + uint[]? cache = asciiLazyCache ?? Interlocked.CompareExchange(ref asciiLazyCache, new uint[CacheArrayLength], null) ?? asciiLazyCache; #if REGEXGENERATOR - InterlockedExtensions.Or(ref slot, bitsToSet); + InterlockedExtensions.Or(ref cache[ch >> 4], bitsToSet); #else - Interlocked.Or(ref slot, bitsToSet); + Interlocked.Or(ref cache[ch >> 4], bitsToSet); #endif // Return the computed value. return isInClass; } - - // Non-ASCII. Fall back to computing the answer. - return CharInClassRecursive(ch, set, 0); } + /// + /// Determines a character's membership in a character class (via the string representation of the class). + /// public static bool CharInClass(char ch, string set) => CharInClassRecursive(ch, set, 0); @@ -1279,6 +1290,283 @@ private static RegexCharClass ParseRecursive(string charClass, int start) return new RegexCharClass(IsNegated(charClass, start), ranges, categoriesBuilder, sub); } + #region Perf workaround until https://github.com/dotnet/runtime/issues/61048 and https://github.com/dotnet/runtime/issues/59492 are addressed + // TODO: https://github.com/dotnet/runtime/issues/61048 + // The below functionality needs to be removed/replaced/generalized. The goal is to avoid relying on + // ToLower and culture-based operation at match time, and instead be able to compute at construction + // time case folding equivalence classes that let us determine up-front the set of characters considered + // valid for a match. For now, we do this just for ASCII, and for anything else fall back to the + // pre-existing mechanism whereby a culture is used at construction time to ToLower and then one is + // used at match time to ToLower. We also skip 'i' and 'I', as the casing of those varies across culture + // whereas every other ASCII value's casing is stable across culture. 
We could hardcode the values for + // when an invariant vs tr/az culture vs any other culture is used, and we likely will, but for now doing + // so would be a breaking change, as in doing so we'd be relying only on the culture present at the time + // of construction rather than the one at the time of match. That will be resolved with + // https://github.com/dotnet/runtime/issues/59492. + + /// Creates a set string for a single character, optionally factoring in case-insensitivity. + /// The character for which to create the set. + /// null if case-sensitive; non-null if case-insensitive, in which case it's the culture to use. + /// false if the caller should strip out RegexOptions.IgnoreCase because it's now fully represented by the set; otherwise, true. + /// The created set string. + public static string OneToStringClass(char c, CultureInfo? caseInsensitive, out bool resultIsCaseInsensitive) + { + var vsb = new ValueStringBuilder(stackalloc char[4]); + + if (caseInsensitive is null) + { + resultIsCaseInsensitive = false; + vsb.Append(c); + } + else if (c < 128 && (c | 0x20) != 'i') + { + resultIsCaseInsensitive = false; + switch (c) + { + // These are the same in all cultures. As with the rest of this support, we can generalize this + // once we fix the aforementioned casing issues, e.g. by lazily populating an interning cache + // rather than hardcoding the strings for these values, once almost all values will be the same + // regardless of culture.
+ case 'A': case 'a': return "\0\x0004\0ABab"; + case 'B': case 'b': return "\0\x0004\0BCbc"; + case 'C': case 'c': return "\0\x0004\0CDcd"; + case 'D': case 'd': return "\0\x0004\0DEde"; + case 'E': case 'e': return "\0\x0004\0EFef"; + case 'F': case 'f': return "\0\x0004\0FGfg"; + case 'G': case 'g': return "\0\x0004\0GHgh"; + case 'H': case 'h': return "\0\x0004\0HIhi"; + // allow 'i' to fall through + case 'J': case 'j': return "\0\x0004\0JKjk"; + case 'K': case 'k': return "\0\x0006\0KLkl\u212A\u212B"; + case 'L': case 'l': return "\0\x0004\0LMlm"; + case 'M': case 'm': return "\0\x0004\0MNmn"; + case 'N': case 'n': return "\0\x0004\0NOno"; + case 'O': case 'o': return "\0\x0004\0OPop"; + case 'P': case 'p': return "\0\x0004\0PQpq"; + case 'Q': case 'q': return "\0\x0004\0QRqr"; + case 'R': case 'r': return "\0\x0004\0RSrs"; + case 'S': case 's': return "\0\x0004\0STst"; + case 'T': case 't': return "\0\x0004\0TUtu"; + case 'U': case 'u': return "\0\x0004\0UVuv"; + case 'V': case 'v': return "\0\x0004\0VWvw"; + case 'W': case 'w': return "\0\x0004\0WXwx"; + case 'X': case 'x': return "\0\x0004\0XYxy"; + case 'Y': case 'y': return "\0\x0004\0YZyz"; + case 'Z': case 'z': return "\0\x0004\0Z[z{"; + + // All the ASCII !ParticipatesInCaseConversion + case '\u0000': return "\0\u0002\0\u0000\u0001"; + case '\u0001': return "\0\u0002\0\u0001\u0002"; + case '\u0002': return "\0\u0002\0\u0002\u0003"; + case '\u0003': return "\0\u0002\0\u0003\u0004"; + case '\u0004': return "\0\u0002\0\u0004\u0005"; + case '\u0005': return "\0\u0002\0\u0005\u0006"; + case '\u0006': return "\0\u0002\0\u0006\u0007"; + case '\u0007': return "\0\u0002\0\u0007\u0008"; + case '\u0008': return "\0\u0002\0\u0008\u0009"; + case '\u0009': return "\0\u0002\0\u0009\u000A"; + case '\u000A': return "\0\u0002\0\u000A\u000B"; + case '\u000B': return "\0\u0002\0\u000B\u000C"; + case '\u000C': return "\0\u0002\0\u000C\u000D"; + case '\u000D': return "\0\u0002\0\u000D\u000E"; + case '\u000E': return 
"\0\u0002\0\u000E\u000F"; + case '\u000F': return "\0\u0002\0\u000F\u0010"; + case '\u0010': return "\0\u0002\0\u0010\u0011"; + case '\u0011': return "\0\u0002\0\u0011\u0012"; + case '\u0012': return "\0\u0002\0\u0012\u0013"; + case '\u0013': return "\0\u0002\0\u0013\u0014"; + case '\u0014': return "\0\u0002\0\u0014\u0015"; + case '\u0015': return "\0\u0002\0\u0015\u0016"; + case '\u0016': return "\0\u0002\0\u0016\u0017"; + case '\u0017': return "\0\u0002\0\u0017\u0018"; + case '\u0018': return "\0\u0002\0\u0018\u0019"; + case '\u0019': return "\0\u0002\0\u0019\u001A"; + case '\u001A': return "\0\u0002\0\u001A\u001B"; + case '\u001B': return "\0\u0002\0\u001B\u001C"; + case '\u001C': return "\0\u0002\0\u001C\u001D"; + case '\u001D': return "\0\u0002\0\u001D\u001E"; + case '\u001E': return "\0\u0002\0\u001E\u001F"; + case '\u001F': return "\0\u0002\0\u001F\u0020"; + case '\u0020': return "\0\u0002\0\u0020\u0021"; + case '\u0021': return "\0\u0002\0\u0021\u0022"; + case '\u0022': return "\0\u0002\0\u0022\u0023"; + case '\u0023': return "\0\u0002\0\u0023\u0024"; + case '\u0025': return "\0\u0002\0\u0025\u0026"; + case '\u0026': return "\0\u0002\0\u0026\u0027"; + case '\u0027': return "\0\u0002\0\u0027\u0028"; + case '\u0028': return "\0\u0002\0\u0028\u0029"; + case '\u0029': return "\0\u0002\0\u0029\u002A"; + case '\u002A': return "\0\u0002\0\u002A\u002B"; + case '\u002C': return "\0\u0002\0\u002C\u002D"; + case '\u002D': return "\0\u0002\0\u002D\u002E"; + case '\u002E': return "\0\u0002\0\u002E\u002F"; + case '\u002F': return "\0\u0002\0\u002F\u0030"; + case '\u0030': return "\0\u0002\0\u0030\u0031"; + case '\u0031': return "\0\u0002\0\u0031\u0032"; + case '\u0032': return "\0\u0002\0\u0032\u0033"; + case '\u0033': return "\0\u0002\0\u0033\u0034"; + case '\u0034': return "\0\u0002\0\u0034\u0035"; + case '\u0035': return "\0\u0002\0\u0035\u0036"; + case '\u0036': return "\0\u0002\0\u0036\u0037"; + case '\u0037': return "\0\u0002\0\u0037\u0038"; + case '\u0038': return 
"\0\u0002\0\u0038\u0039"; + case '\u0039': return "\0\u0002\0\u0039\u003A"; + case '\u003A': return "\0\u0002\0\u003A\u003B"; + case '\u003B': return "\0\u0002\0\u003B\u003C"; + case '\u003F': return "\0\u0002\0\u003F\u0040"; + case '\u0040': return "\0\u0002\0\u0040\u0041"; + case '\u005B': return "\0\u0002\0\u005B\u005C"; + case '\u005C': return "\0\u0002\0\u005C\u005D"; + case '\u005D': return "\0\u0002\0\u005D\u005E"; + case '\u005F': return "\0\u0002\0\u005F\u0060"; + case '\u007B': return "\0\u0002\0\u007B\u007C"; + case '\u007D': return "\0\u0002\0\u007D\u007E"; + case '\u007F': return "\0\u0002\0\u007F\u0080"; + } + AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, caseInsensitive); + } + else if (!ParticipatesInCaseConversion(c)) + { + resultIsCaseInsensitive = false; + vsb.Append(c); + } + else + { + resultIsCaseInsensitive = true; + vsb.Append(char.ToLower(c, caseInsensitive)); + } + + string result = CharsToStringClass(vsb.AsSpan()); + vsb.Dispose(); + return result; + } + + private static unsafe string CharsToStringClass(ReadOnlySpan chars) + { +#if DEBUG + // Make sure they're all sorted with no duplicates + for (int index = 0; index < chars.Length - 1; index++) + { + Debug.Assert(chars[index] < chars[index + 1]); + } +#endif + + // If there aren't any chars, just return an empty class. + if (chars.Length == 0) + { + return EmptyClass; + } + + // Count how many characters there actually are. All but the very last possible + // char value will have two characters, one for the inclusive beginning of range + // and one for the exclusive end of range. + int count = chars.Length * 2; + if (chars[chars.Length - 1] == LastChar) + { + count--; + } + + // Get the pointer/length of the span to be able to pass it into string.Create. 
+ fixed (char* charsPtr = chars) + { +#if REGEXGENERATOR + return StringExtensions.Create( +#else + return string.Create( +#endif + SetStartIndex + count, ((IntPtr)charsPtr, chars.Length), static (span, state) => + { + // Reconstruct the span now that we're inside of the lambda. + ReadOnlySpan chars = new ReadOnlySpan((char*)state.Item1, state.Length); + + // Fill in the set string + span[FlagsIndex] = (char)0; + span[CategoryLengthIndex] = (char)0; + span[SetLengthIndex] = (char)(span.Length - SetStartIndex); + int i = SetStartIndex; + foreach (char c in chars) + { + span[i++] = c; + if (c != LastChar) + { + span[i++] = (char)(c + 1); + } + } + Debug.Assert(i == span.Length); + }); + } + } + + /// Tries to create from a RegexOptions.IgnoreCase set string a new set string that can be used without RegexOptions.IgnoreCase. + /// The original set string from a RegexOptions.IgnoreCase node. + /// The culture in use. + /// A new set string if one could be created. + public static string? MakeCaseSensitiveIfPossible(string set, CultureInfo culture) + { + if (IsNegated(set)) + { + return null; + } + + // We'll eventually need a more robust way to do this for any set. For now, we iterate through each character + // in the set, and to avoid spending lots of time doing so, we limit the number of characters. This approach also + // limits the structure of the sets allowed, e.g. they can't be negated, can't use subtraction, etc. + Span setChars = stackalloc char[64]; // arbitrary limit chosen to include common groupings like all ASCII letters and digits + + // Try to get the set's characters. + int setCharsCount = GetSetChars(set, setChars); + if (setCharsCount == 0) + { + return null; + } + + // Enumerate all the characters and add all characters that form their case folding equivalence class.
+ var rcc = new RegexCharClass(); + var vsb = new ValueStringBuilder(stackalloc char[4]); + foreach (char c in setChars.Slice(0, setCharsCount)) + { + if (c >= 128 || c == 'i' || c == 'I') + { + return null; + } + + vsb.Length = 0; + AddAsciiCharIgnoreCaseEquivalence(c, ref vsb, culture); + foreach (char v in vsb.AsSpan()) + { + rcc.AddChar(v); + } + } + + // Return the constructed class. + return rcc.ToStringClass(); + } + + private static void AddAsciiCharIgnoreCaseEquivalence(char c, ref ValueStringBuilder vsb, CultureInfo culture) + { + Debug.Assert(c < 128, $"Expected ASCII, got {(int)c}"); + Debug.Assert(c != 'i' && c != 'I', "'i' currently doesn't work correctly in all cultures"); + + char upper = char.ToUpper(c, culture); + char lower = char.ToLower(c, culture); + + if (upper < lower) + { + vsb.Append(upper); + } + vsb.Append(lower); + if (upper > lower) + { + vsb.Append(upper); + } + + if (c == 'k' || c == 'K') + { + vsb.Append((char)0x212A); // kelvin sign + } + } + #endregion + /// /// Constructs the string representation of the class. 
/// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index d8700ebb9bd28..6f6c8cd8f8852 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -16,6 +16,7 @@ using System.Collections; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; namespace System.Text.RegularExpressions { @@ -96,35 +97,25 @@ internal sealed class RegexCode public readonly RegexTree Tree; // the optimized parse tree public readonly int[] Codes; // the code public readonly string[] Strings; // the string/set table - public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings + public readonly uint[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings public readonly int TrackCount; // how many instructions use backtracking public readonly Hashtable? Caps; // mapping of user group numbers -> impl group slots public readonly int CapSize; // number of impl group slots - public readonly (string CharClass, bool CaseInsensitive)[]? LeadingCharClasses; // the set of candidate first characters, if available. Each entry corresponds to the next char in the input. - public int[]? LeadingCharClassAsciiLookup; // the ASCII lookup table optimization for LeadingCharClasses[0], if it exists; only used by the interpreter - public readonly RegexBoyerMoore? 
BoyerMoorePrefix; // the fixed prefix string as a Boyer-Moore machine, if available - public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc) public readonly bool RightToLeft; // true if right to left + public readonly RegexFindOptimizations FindOptimizations; - public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, - Hashtable? caps, int capsize, - RegexBoyerMoore? boyerMoorePrefix, - (string CharClass, bool CaseInsensitive)[]? leadingCharClasses, - int leadingAnchor, bool rightToLeft) + public RegexCode(RegexTree tree, CultureInfo culture, int[] codes, string[] strings, int trackcount, + Hashtable? caps, int capsize) { - Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null); - Tree = tree; Codes = codes; Strings = strings; - StringsAsciiLookup = new int[strings.Length][]; + StringsAsciiLookup = new uint[strings.Length][]; TrackCount = trackcount; Caps = caps; CapSize = capsize; - BoyerMoorePrefix = boyerMoorePrefix; - LeadingCharClasses = leadingCharClasses; - LeadingAnchor = leadingAnchor; - RightToLeft = rightToLeft; + RightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; + FindOptimizations = new RegexFindOptimizations(tree, culture); } public static bool OpcodeBacktracks(int Op) @@ -409,26 +400,8 @@ public override string ToString() var sb = new StringBuilder(); sb.AppendLine($"Direction: {(RightToLeft ? 
"right-to-left" : "left-to-right")}"); - sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(LeadingAnchor)}"); + sb.AppendLine($"Anchor: {RegexPrefixAnalyzer.AnchorDescription(FindOptimizations.LeadingAnchor)}"); sb.AppendLine(); - - if (BoyerMoorePrefix != null) - { - sb.AppendLine("Boyer-Moore:"); - sb.AppendLine(BoyerMoorePrefix.Dump(" ")); - sb.AppendLine(); - } - - if (LeadingCharClasses != null) - { - sb.AppendLine("First Chars:"); - for (int i = 0; i < LeadingCharClasses.Length; i++) - { - sb.AppendLine($"{i}: {RegexCharClass.SetDescription(LeadingCharClasses[i].CharClass)}"); - } - sb.AppendLine(); - } - for (int i = 0; i < Codes.Length; i += OpcodeSize(Codes[i])) { sb.AppendLine(OpcodeDescription(i)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index e288567bf14a7..cfafa2656677d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -62,6 +62,9 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { 
typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfChar = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -90,9 +93,6 @@ internal abstract class RegexCompiler protected RegexCode? _code; // the RegexCode object protected int[]? _codes; // the RegexCodes being translated protected string[]? _strings; // the stringtable associated with the RegexCodes - protected (string CharClass, bool CaseInsensitive)[]? 
_leadingCharClasses; // the possible first chars computed by RegexPrefixAnalyzer - protected RegexBoyerMoore? _boyerMoorePrefix; // a prefix as a boyer-moore machine - protected int _leadingAnchor; // the set of anchors protected bool _hasTimeout; // whether the regex has a non-infinite timeout private Label[]? _labels; // a label for every operation in _codes @@ -928,20 +928,20 @@ protected void GenerateFindFirstChar() } _runtextLocal = DeclareString(); _textInfoLocal = null; - if (!_options.HasFlag(RegexOptions.CultureInvariant)) + if ((_options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = _options.HasFlag(RegexOptions.IgnoreCase) || _boyerMoorePrefix?.CaseInsensitive == true; - if (!needsCulture && _leadingCharClasses != null) + bool needsCulture = _code.FindOptimizations.FindMode switch { - for (int i = 0; i < _leadingCharClasses.Length; i++) - { - if (_leadingCharClasses[i].CaseInsensitive) - { - needsCulture = true; - break; - } - } - } + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive or + FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or + FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive => true, + + _ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + + _ => false, + }; if (needsCulture) { @@ -1012,43 +1012,66 @@ protected void GenerateFindFirstChar() Ret(); MarkLabel(finishedLengthCheck); - GenerateAnchorChecks(); + // Some anchors help to advance the position but don't terminate the operation. + // As such, we do the anchors check first, and then treat them below the same + // as if there's no special searching enabled. 
+ GenerateAnchors(); - if (_boyerMoorePrefix is RegexBoyerMoore { NegativeUnicode: null } rbm) - { - if (rbm.PatternSupportsIndexOf) - { - GenerateIndexOf(rbm.Pattern); - } - else - { - GenerateBoyerMoore(rbm); - } - } - else if (_leadingCharClasses is not null) - { - if (_code.RightToLeft) - { - GenerateLeadingCharacter_RightToLeft(); - } - else - { - GenerateLeadingCharacter_LeftToRight(); - } - } - else + switch (_code.FindOptimizations.FindMode) { - // return true; - Ldc(1); - Ret(); + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); + GenerateIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); + GenerateIndexOf_RightToLeft(_code.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + GenerateFixedSet_LeftToRight(); + break; + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + GenerateFixedSet_RightToLeft(); + break; + + // Already emitted earlier + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: + case 
FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + Debug.Assert(_code.FindOptimizations.LeadingAnchor != 0); + goto case FindNextStartingPositionMode.NoSearch; + + default: + Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}"); + goto case FindNextStartingPositionMode.NoSearch; + + case FindNextStartingPositionMode.NoSearch: + // return true; + Ldc(1); + Ret(); + break; } - void GenerateAnchorChecks() + void GenerateAnchors() { // Generate anchor checks. - if ((_leadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) + if ((_code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) { - switch (_leadingAnchor) + switch (_code.FindOptimizations.LeadingAnchor) { case RegexPrefixAnalyzer.Beginning: { @@ -1159,14 +1182,14 @@ void GenerateAnchorChecks() Ret(); return; - case RegexPrefixAnalyzer.Bol when !_code.RightToLeft: // don't bother optimizing for the niche case of RegexOptions.RightToLeft | RegexOptions.Multiline + case RegexPrefixAnalyzer.Bol: { // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. 
+ // to boost our position to the next line, and then continue normally with any prefix or char class searches. + Debug.Assert(!_code.RightToLeft, "RightToLeft isn't implemented and should have been filtered out previously"); Label atBeginningOfLine = DefineLabel(); // if (runtextpos > runtextbeg... @@ -1220,216 +1243,7 @@ void GenerateAnchorChecks() } } - void GenerateBoyerMoore(RegexBoyerMoore rbm) - { - LocalBuilder limitLocal; - int beforefirst; - int last; - if (!_code.RightToLeft) - { - limitLocal = _runtextendLocal; - beforefirst = -1; - last = rbm.Pattern.Length - 1; - } - else - { - limitLocal = _runtextbegLocal!; - beforefirst = rbm.Pattern.Length; - last = 0; - } - - int chLast = rbm.Pattern[last]; - - // string runtext = this.runtext; - Mvfldloc(s_runtextField, _runtextLocal); - - // runtextpos += pattern.Length - 1; // advance to match last character - Ldloc(_runtextposLocal); - if (!_code.RightToLeft) - { - Ldc(rbm.Pattern.Length - 1); - Add(); - } - else - { - Ldc(rbm.Pattern.Length); - Sub(); - } - Stloc(_runtextposLocal); - - Label lStart = DefineLabel(); - Br(lStart); - - // DefaultAdvance: - // offset = pattern.Length; - Label lDefaultAdvance = DefineLabel(); - MarkLabel(lDefaultAdvance); - Ldc(_code.RightToLeft ? 
-rbm.Pattern.Length : rbm.Pattern.Length); - - // Advance: - // runtextpos += offset; - Label lAdvance = DefineLabel(); - MarkLabel(lAdvance); - Ldloc(_runtextposLocal); - Add(); - Stloc(_runtextposLocal); - - // Start: - // if (runtextpos >= runtextend) goto returnFalse; - MarkLabel(lStart); - Ldloc(_runtextposLocal); - Ldloc(limitLocal); - if (!_code.RightToLeft) - { - BgeFar(returnFalse); - } - else - { - BltFar(returnFalse); - } - - // ch = runtext[runtextpos]; - Rightchar(); - if (rbm.CaseInsensitive) - { - CallToLower(); - } - - Label lPartialMatch = DefineLabel(); - using (RentedLocalBuilder chLocal = RentInt32Local()) - { - Stloc(chLocal); - Ldloc(chLocal); - Ldc(chLast); - - // if (ch == lastChar) goto partialMatch; - BeqFar(lPartialMatch); - - // ch -= lowAscii; - // if (ch > (highAscii - lowAscii)) goto defaultAdvance; - Ldloc(chLocal); - Ldc(rbm.LowASCII); - Sub(); - Stloc(chLocal); - Ldloc(chLocal); - Ldc(rbm.HighASCII - rbm.LowASCII); - BgtUn(lDefaultAdvance); - - // int offset = "lookupstring"[num]; - // goto advance; - int negativeRange = rbm.HighASCII - rbm.LowASCII + 1; - if (negativeRange > 1) - { - // Create a string to store the lookup table we use to find the offset. - Debug.Assert(rbm.Pattern.Length <= char.MaxValue, "RegexBoyerMoore should have limited the size allowed."); - string negativeLookup = string.Create(negativeRange, (rbm, beforefirst), static (span, state) => - { - // Store the offsets into the string. RightToLeft has negative offsets, so to support it with chars (unsigned), we negate - // the values to be stored in the string, and then at run time after looking up the offset in the string, negate it again. 
- for (int i = 0; i < span.Length; i++) - { - int offset = state.rbm.NegativeASCII[i + state.rbm.LowASCII]; - if (offset == state.beforefirst) - { - offset = state.rbm.Pattern.Length; - } - else if (state.rbm.RightToLeft) - { - offset = -offset; - } - Debug.Assert(offset >= 0 && offset <= char.MaxValue); - span[i] = (char)offset; - } - }); - - // offset = lookupString[ch]; - // goto Advance; - Ldstr(negativeLookup); - Ldloc(chLocal); - Call(s_stringGetCharsMethod); - if (_code.RightToLeft) - { - Neg(); - } - } - else - { - // offset = value; - Debug.Assert(negativeRange == 1); - int offset = rbm.NegativeASCII[rbm.LowASCII]; - if (offset == beforefirst) - { - offset = _code.RightToLeft ? -rbm.Pattern.Length : rbm.Pattern.Length; - } - Ldc(offset); - } - BrFar(lAdvance); - } - - // Emit a check for each character from the next to last down to the first. - MarkLabel(lPartialMatch); - Ldloc(_runtextposLocal); - using (RentedLocalBuilder testLocal = RentInt32Local()) - { - Stloc(testLocal); - - int prevLabelOffset = int.MaxValue; - Label prevLabel = default; - for (int i = rbm.Pattern.Length - 2; i >= 0; i--) - { - int charindex = _code.RightToLeft ? 
rbm.Pattern.Length - 1 - i : i; - - // if (runtext[--test] == pattern[index]) goto lNext; - Ldloc(_runtextLocal); - Ldloc(testLocal); - Ldc(1); - Sub(_code.RightToLeft); - Stloc(testLocal); - Ldloc(testLocal); - Call(s_stringGetCharsMethod); - if (rbm.CaseInsensitive && RegexCharClass.ParticipatesInCaseConversion(rbm.Pattern[charindex])) - { - CallToLower(); - } - Ldc(rbm.Pattern[charindex]); - - if (prevLabelOffset == rbm.Positive[charindex]) - { - BneFar(prevLabel); - } - else - { - Label lNext = DefineLabel(); - Beq(lNext); - - // offset = positive[ch]; - // goto advance; - prevLabel = DefineLabel(); - prevLabelOffset = rbm.Positive[charindex]; - MarkLabel(prevLabel); - Ldc(prevLabelOffset); - BrFar(lAdvance); - - MarkLabel(lNext); - } - } - - // this.runtextpos = test; - // return true; - Ldthis(); - Ldloc(testLocal); - if (_code.RightToLeft) - { - Ldc(1); - Add(); - } - Stfld(s_runtextposField); - Ldc(1); - Ret(); - } - } - - void GenerateIndexOf(string prefix) + void GenerateIndexOf_LeftToRight(string prefix) { using RentedLocalBuilder i = RentInt32Local(); @@ -1446,11 +1260,7 @@ void GenerateIndexOf(string prefix) Call(s_spanIndexOfSpan); Stloc(i); - // if (i < 0) - // { - // base.runtextpos = runtextend; - // return false; - // } + // if (i < 0) goto ReturnFalse; Ldloc(i); Ldc(0); BltFar(returnFalse); @@ -1466,105 +1276,135 @@ void GenerateIndexOf(string prefix) Ret(); } - void GenerateLeadingCharacter_RightToLeft() + void GenerateIndexOf_RightToLeft(string prefix) { - Debug.Assert(_leadingCharClasses.Length == 1, "Only the FirstChars and not MultiFirstChars computation is supported for RightToLeft"); - - using RentedLocalBuilder cLocal = RentInt32Local(); - - Label l1 = DefineLabel(); - Label l2 = DefineLabel(); - Label l3 = DefineLabel(); - Label l4 = DefineLabel(); - Label l5 = DefineLabel(); - - Mvfldloc(s_runtextField, _runtextLocal); + using RentedLocalBuilder i = RentInt32Local(); + // int i = runtext.AsSpan(runtextbeg, runtextpos -
runtextbeg).LastIndexOf(prefix); + Ldthis(); + Ldfld(s_runtextField); + Ldloc(_runtextbegLocal!); Ldloc(_runtextposLocal); Ldloc(_runtextbegLocal!); Sub(); - Stloc(cLocal); + Call(s_stringAsSpanIntIntMethod); + Ldstr(prefix); + Call(s_stringAsSpanMethod); + Call(s_spanLastIndexOfSpan); + Stloc(i); - if (minRequiredLength == 0) // if minRequiredLength > 0, we already output a more stringent check - { - Ldloc(cLocal); - Ldc(0); - BleFar(l4); - } + // if (i < 0) goto ReturnFalse; + Ldloc(i); + Ldc(0); + BltFar(returnFalse); - MarkLabel(l1); - Ldloc(cLocal); + // base.runtextpos = runtextbeg + i + LeadingCaseSensitivePrefix.Length; + // return true; + Ldthis(); + Ldloc(_runtextbegLocal!); + Ldloc(i); + Add(); + Ldc(prefix.Length); + Add(); + Stfld(s_runtextposField); Ldc(1); - Sub(); - Stloc(cLocal); + Ret(); + } - Leftcharnext(); + void GenerateFixedSet_RightToLeft() + { + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) set = _code.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); - if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass)) - { - EmitMatchCharacterClass(_leadingCharClasses[0].CharClass, _leadingCharClasses[0].CaseInsensitive); - Brtrue(l2); - } - else - { - Ldc(RegexCharClass.SingletonChar(_leadingCharClasses[0].CharClass)); - Beq(l2); - } + using RentedLocalBuilder i = RentInt32Local(); - MarkLabel(l5); + Mvfldloc(s_runtextField, _runtextLocal); - Ldloc(cLocal); - Ldc(0); - if (!RegexCharClass.IsSingleton(_leadingCharClasses[0].CharClass)) + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) { - BgtFar(l1); + // int i = runtext.AsSpan(runtextpos, runtextbeg, runtextpos - runtextbeg).LastIndexOf(set.Chars[0]); + Ldthis(); + Ldfld(s_runtextField); + Ldloc(_runtextbegLocal!); + Ldloc(_runtextposLocal); + Ldloc(_runtextbegLocal!); + Sub(); + Call(s_stringAsSpanIntIntMethod); + Ldc(set.Chars[0]); + Call(s_spanLastIndexOfChar); + Stloc(i); + + // if (i < 0) goto ReturnFalse; + Ldloc(i); + Ldc(0); + 
BltFar(returnFalse); + + // base.runtextpos = runtextbeg + i + 1; + // return true; + Ldthis(); + Ldloc(_runtextbegLocal!); + Ldloc(i); + Add(); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); } else { - Bgt(l1); - } - - Ldc(0); - Br(l3); + Label condition = DefineLabel(); + Label increment = DefineLabel(); + Label body = DefineLabel(); - MarkLabel(l2); - - Ldloc(_runtextposLocal); - Ldc(1); - Sub(_code.RightToLeft); - Stloc(_runtextposLocal); - Ldc(1); + // for (int i = runtextpos - 1; ... + Ldloc(_runtextposLocal); + Ldc(1); + Sub(); + Stloc(i); + Br(condition); + + // if (MatchCharClass(runtext[i], set)) + MarkLabel(body); + Ldloc(_runtextLocal); + Ldloc(i); + Call(s_stringGetCharsMethod); + EmitMatchCharacterClass(set.Set, set.CaseInsensitive); + Brfalse(increment); + + // base.runtextpos = i + 1; + // return true; + Ldthis(); + Ldloc(i); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); - MarkLabel(l3); + // for (...; ...; i--) + MarkLabel(increment); + Ldloc(i); + Ldc(1); + Sub(); + Stloc(i); - Mvlocfld(_runtextposLocal, s_runtextposField); - Ret(); + // for (...; i >= runtextbeg; ...) + MarkLabel(condition); + Ldloc(i); + Ldloc(_runtextbegLocal!); + Bge(body); + } - MarkLabel(l4); - Ldc(0); - Ret(); + BrFar(returnFalse); } - void GenerateLeadingCharacter_LeftToRight() + void GenerateFixedSet_LeftToRight() { - Debug.Assert(_leadingCharClasses != null && _leadingCharClasses.Length > 0); - - // If minRequiredLength > 0, we already output a more stringent check. In the rare case - // where we were unable to get an accurate enough min required length to ensure it's larger - // than the prefixes we calculated, we also need to ensure we have enough spaces for those, - // as they also represent a min required length. 
- if (minRequiredLength < _leadingCharClasses.Length) - { - // if (runtextpos >= runtextend - (_leadingCharClasses.Length - 1)) goto returnFalse; - Ldloc(_runtextendLocal); - if (_leadingCharClasses.Length > 1) - { - Ldc(_leadingCharClasses.Length - 1); - Sub(); - } - Ldloc(_runtextposLocal); - BleFar(returnFalse); - } + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets; + (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; + const int MaxSets = 4; + int setsToUse = Math.Min(sets.Count, MaxSets); using RentedLocalBuilder iLocal = RentInt32Local(); using RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal(); @@ -1580,13 +1420,9 @@ void GenerateLeadingCharacter_LeftToRight() // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. 
- Span setChars = stackalloc char[3]; // up to 3 characters handled by IndexOf{Any} below - int setCharsCount = 0, charClassIndex = 0; - bool canUseIndexOf = - !_leadingCharClasses[0].CaseInsensitive && - (setCharsCount = RegexCharClass.GetSetChars(_leadingCharClasses[0].CharClass, setChars)) > 0 && - !RegexCharClass.IsNegated(_leadingCharClasses[0].CharClass); - bool needLoop = !canUseIndexOf || _leadingCharClasses.Length > 1; + int setIndex = 0; + bool canUseIndexOf = !primarySet.CaseInsensitive && primarySet.Chars is not null; + bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; Label charNotInClassLabel = default; @@ -1606,13 +1442,25 @@ void GenerateLeadingCharacter_LeftToRight() if (canUseIndexOf) { - charClassIndex = 1; + setIndex = 1; if (needLoop) { - // textSpan.Slice(iLocal) + // textSpan.Slice(iLocal + primarySet.Distance); Ldloca(textSpanLocal); Ldloc(iLocal); + if (primarySet.Distance != 0) + { + Ldc(primarySet.Distance); + Add(); + } + Call(s_spanSliceIntMethod); + } + else if (primarySet.Distance != 0) + { + // textSpan.Slice(primarySet.Distance) + Ldloca(textSpanLocal); + Ldc(primarySet.Distance); Call(s_spanSliceIntMethod); } else @@ -1621,29 +1469,34 @@ void GenerateLeadingCharacter_LeftToRight() Ldloc(textSpanLocal); } - switch (setCharsCount) + switch (primarySet.Chars!.Length) { case 1: // tmp = ...IndexOf(setChars[0]); - Ldc(setChars[0]); + Ldc(primarySet.Chars[0]); Call(s_spanIndexOfChar); break; case 2: // tmp = ...IndexOfAny(setChars[0], setChars[1]); - Ldc(setChars[0]); - Ldc(setChars[1]); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); Call(s_spanIndexOfAnyCharChar); break; - default: // 3 + case 3: // tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]}); - Debug.Assert(setCharsCount == 3); - Ldc(setChars[0]); - Ldc(setChars[1]); - Ldc(setChars[2]); + Ldc(primarySet.Chars[0]); + Ldc(primarySet.Chars[1]); + Ldc(primarySet.Chars[2]); Call(s_spanIndexOfAnyCharCharChar); break; + + default: 
+ Ldstr(new string(primarySet.Chars)); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); + break; } if (needLoop) @@ -1672,13 +1525,13 @@ void GenerateLeadingCharacter_LeftToRight() BltFar(returnFalse); } - // if (i >= textSpan.Length - (_leadingCharClasses.Length - 1)) goto returnFalse; - if (_leadingCharClasses.Length > 1) + // if (i >= textSpan.Length - (minRequiredLength - 1)) goto returnFalse; + if (sets.Count > 1) { Debug.Assert(needLoop); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - Ldc(_leadingCharClasses.Length - 1); + Ldc(minRequiredLength - 1); Sub(); Ldloc(iLocal); BleFar(returnFalse); @@ -1689,20 +1542,20 @@ void GenerateLeadingCharacter_LeftToRight() // if (!CharInClass(textSpan[i + 1], prefix[1], "...")) continue; // if (!CharInClass(textSpan[i + 2], prefix[2], "...")) continue; // ... - Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++) + Debug.Assert(setIndex == 0 || setIndex == 1); + for ( ; setIndex < sets.Count; setIndex++) { Debug.Assert(needLoop); Ldloca(textSpanLocal); Ldloc(iLocal); - if (charClassIndex > 0) + if (sets[setIndex].Distance != 0) { - Ldc(charClassIndex); + Ldc(sets[setIndex].Distance); Add(); } Call(s_spanGetItemMethod); LdindU2(); - EmitMatchCharacterClass(_leadingCharClasses[charClassIndex].CharClass, _leadingCharClasses[charClassIndex].CaseInsensitive); + EmitMatchCharacterClass(sets[setIndex].Set, sets[setIndex].CaseInsensitive); BrfalseFar(charNotInClassLabel); } @@ -1726,14 +1579,14 @@ void GenerateLeadingCharacter_LeftToRight() Add(); Stloc(iLocal); - // for (...; i < span.Length - (_leadingCharClasses.Length - 1); ...); + // for (...; i < span.Length - (minRequiredLength - 1); ...); MarkLabel(checkSpanLengthLabel); Ldloc(iLocal); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - if (_leadingCharClasses.Length > 1) + if (setsToUse > 1) { - Ldc(_leadingCharClasses.Length - 1); + Ldc(minRequiredLength - 1); Sub(); } 
BltFar(loopBody); @@ -2278,23 +2131,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.Concatenate: - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) - { - EmitSpanLengthCheck(requiredLength); - for (; i < exclusiveEnd; i++) - { - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); - } - - i--; - continue; - } - - EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); - } + EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Capture: @@ -2339,6 +2176,28 @@ void EmitUpdateBumpalong() Stfld(s_runtextposField); } + // Emits code for a concatenation + void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChecksIfRequired) + { + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + { + EmitSpanLengthCheck(requiredLength); + for (; i < exclusiveEnd; i++) + { + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); + } + + i--; + continue; + } + + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); + } + } + // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null) { @@ -2362,7 +2221,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? 
o } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -2595,7 +2454,7 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) EmitTextSpanOffset(); textSpanPos++; LdindU2(); - if (caseInsensitive && RegexCharClass.ParticipatesInCaseConversion(s[i])) + if (caseInsensitive) { CallToLower(); } @@ -2865,12 +2724,12 @@ void EmitSingleCharAtomicLoop(RegexNode node) Label atomicLoopDoneLabel = DefineLabel(); - Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars = 0; if (node.IsNotoneFamily && maxIterations == int.MaxValue && - (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) + (!IsCaseInsensitive(node))) { // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, @@ -2911,14 +2770,15 @@ void EmitSingleCharAtomicLoop(RegexNode node) else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0 && RegexCharClass.IsNegated(node.Str!)) { - // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would + // If the set is negated and contains only a few characters (if it contained 1 and was negated, it would // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. 
+ Debug.Assert(numSetChars > 1); - // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2{, ch3}); + // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2, ...); if (textSpanPos > 0) { Ldloca(textSpanLocal); @@ -2929,17 +2789,26 @@ void EmitSingleCharAtomicLoop(RegexNode node) { Ldloc(textSpanLocal); } - Ldc(setChars[0]); - Ldc(setChars[1]); - if (numSetChars == 2) - { - Call(s_spanIndexOfAnyCharChar); - } - else + switch (numSetChars) { - Debug.Assert(numSetChars == 3); - Ldc(setChars[2]); - Call(s_spanIndexOfAnyCharCharChar); + case 2: + Ldc(setChars[0]); + Ldc(setChars[1]); + Call(s_spanIndexOfAnyCharChar); + break; + + case 3: + Ldc(setChars[0]); + Ldc(setChars[1]); + Ldc(setChars[2]); + Call(s_spanIndexOfAnyCharCharChar); + break; + + default: + Ldstr(setChars.Slice(0, numSetChars).ToString()); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); + break; } Stloc(iterationLocal); @@ -3008,7 +2877,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -3095,7 +2964,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } else { - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + if (IsCaseInsensitive(node)) { CallToLower(); } @@ -4185,7 +4054,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4231,7 +4100,7 @@ private void GenerateOneCode() Add(); } Call(s_stringGetCharsMethod); - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i])) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4274,7 +4143,7 @@ private void GenerateOneCode() Ldc(str.Length - i); Sub(); Call(s_stringGetCharsMethod); - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(str[i])) + if (IsCaseInsensitive()) { CallToLower(); } @@ 
-4477,7 +4346,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4579,14 +4448,14 @@ private void GenerateOneCode() Label loopEnd = DefineLabel(); string? set = Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic ? _strings![Operand(0)] : null; - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars; // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive, // we can use the vectorized IndexOf to search for the target character. if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) && !IsRightToLeft() && - (!IsCaseInsensitive() || !RegexCharClass.ParticipatesInCaseConversion(Operand(0)))) + (!IsCaseInsensitive())) { // i = runtext.AsSpan(runtextpos, len).IndexOf(ch); Ldloc(_runtextLocal!); @@ -4633,29 +4502,39 @@ private void GenerateOneCode() else if ((Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic) && !IsRightToLeft() && !IsCaseInsensitive() && - (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) > 1 && + (numSetChars = RegexCharClass.GetSetChars(set!, setChars)) != 0 && RegexCharClass.IsNegated(set!)) { // Similarly, if this is a setloop{atomic} and we're left-to-right and case-sensitive, - // and if the set contains only 2 or 3 negated chars, we can use the vectorized IndexOfAny + // and if the set contains only a few negated chars, we can use the vectorized IndexOfAny // to search for those chars. 
+ Debug.Assert(numSetChars > 1); // i = runtext.AsSpan(runtextpos, len).IndexOfAny(ch1, ch2{, ch3}); Ldloc(_runtextLocal!); Ldloc(_runtextposLocal!); Ldloc(lenLocal); Call(s_stringAsSpanIntIntMethod); - Ldc(setChars[0]); - Ldc(setChars[1]); - if (numSetChars == 2) - { - Call(s_spanIndexOfAnyCharChar); - } - else + switch (numSetChars) { - Debug.Assert(numSetChars == 3); - Ldc(setChars[2]); - Call(s_spanIndexOfAnyCharCharChar); + case 2: + Ldc(setChars[0]); + Ldc(setChars[1]); + Call(s_spanIndexOfAnyCharChar); + break; + + case 3: + Ldc(setChars[0]); + Ldc(setChars[1]); + Ldc(setChars[2]); + Call(s_spanIndexOfAnyCharCharChar); + break; + + default: + Ldstr(setChars.Slice(0, numSetChars).ToString()); + Call(s_stringAsSpanMethod); + Call(s_spanIndexOfAnySpan); + break; } Stloc(iLocal); @@ -4754,7 +4633,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -4955,7 +4834,7 @@ private void GenerateOneCode() } else { - if (IsCaseInsensitive() && RegexCharClass.ParticipatesInCaseConversion(Operand(0))) + if (IsCaseInsensitive()) { CallToLower(); } @@ -5105,21 +4984,34 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive) // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it's cheaper and smaller to compare against each than it is to use a lookup table. 
- if (!invariant) + if (!invariant && !RegexCharClass.IsNegated(charClass)) { - Span setChars = stackalloc char[3]; + Span setChars = stackalloc char[4]; int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (numChars > 0 && !RegexCharClass.IsNegated(charClass)) + if (numChars is 2 or 3) { - // (ch == setChars[0]) | (ch == setChars[1]) { | (ch == setChars[2]) } - Debug.Assert(numChars == 2 || numChars == 3); - Ldloc(tempLocal); - Ldc(setChars[0]); - Ceq(); - Ldloc(tempLocal); - Ldc(setChars[1]); - Ceq(); - Or(); + if ((setChars[0] | 0x20) == setChars[1]) // special-case common case of an upper and lowercase ASCII letter combination + { + // ((ch | 0x20) == setChars[1]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[1]); + Ceq(); + } + else + { + // (ch == setChars[0]) | (ch == setChars[1]) + Ldloc(tempLocal); + Ldc(setChars[0]); + Ceq(); + Ldloc(tempLocal); + Ldc(setChars[1]); + Ceq(); + Or(); + } + + // | (ch == setChars[2]) if (numChars == 3) { Ldloc(tempLocal); @@ -5130,6 +5022,27 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive) return; } + else if (numChars == 4 && + (setChars[0] | 0x20) == setChars[1] && + (setChars[2] | 0x20) == setChars[3]) + { + // ((ch | 0x20) == setChars[1]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[1]); + Ceq(); + + // ((ch | 0x20) == setChars[3]) + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(setChars[3]); + Ceq(); + + Or(); + return; + } } using RentedLocalBuilder resultLocal = RentInt32Local(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs new file mode 100644 index 0000000000000..adab678bba7dd --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -0,0 +1,674 @@ +// Licensed to the .NET Foundation under one or more 
agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; + +namespace System.Text.RegularExpressions +{ + /// Contains state and provides operations related to finding the next location a match could possibly begin. + internal sealed class RegexFindOptimizations + { + /// The minimum required length an input need be to match the pattern. May be 0. + private readonly int _minRequiredLength; + /// True if the input should be processed right-to-left rather than left-to-right. + private readonly bool _rightToLeft; + /// Provides the ToLower routine for lowercasing characters. + private readonly TextInfo _textInfo; + /// Lookup table used for optimizing ASCII when doing set queries. + private readonly uint[]?[]? _asciiLookups; + + public RegexFindOptimizations(RegexTree tree, CultureInfo culture) + { + _rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; + _minRequiredLength = tree.MinRequiredLength; + _textInfo = culture.TextInfo; + + // Compute any anchor starting the expression. If there is one, we won't need to search for anything, + // as we can just match at that single location. + LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); + if (_rightToLeft) + { + // Filter out Bol for RightToLeft, as we don't currently optimize for it. 
+ LeadingAnchor &= ~RegexPrefixAnalyzer.Bol; + } + if ((LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0) + { + FindMode = (LeadingAnchor, _rightToLeft) switch + { + (RegexPrefixAnalyzer.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning, + (RegexPrefixAnalyzer.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning, + (RegexPrefixAnalyzer.Start, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start, + (RegexPrefixAnalyzer.Start, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start, + (RegexPrefixAnalyzer.End, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End, + (RegexPrefixAnalyzer.End, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End, + (_, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ, + (_, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ, + }; + return; + } + + // If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations. + string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree); + if (caseSensitivePrefix.Length > 1) + { + LeadingCaseSensitivePrefix = caseSensitivePrefix; + FindMode = _rightToLeft ? + FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive : + FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive; + return; + } + + // At this point there are no fast-searchable anchors or case-sensitive prefixes. We can now analyze the + // pattern for sets and then use any found sets to determine what kind of search to perform. + + // If we're compiling, then the compilation process already handles sets that reduce to a single literal, + // so we can simplify and just always go for the sets. 
+ bool dfa = (tree.Options & RegexOptions.NonBacktracking) != 0; + bool compiled = (tree.Options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled + bool interpreter = !compiled && !dfa; + + // For interpreter, we want to employ optimizations, but we don't want to make construction significantly + // more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter + // we focus only on creating a set for the first character. Same for right-to-left, which is used very + // rarely and thus we don't need to invest in special-casing it. + if (_rightToLeft) + { + // Determine a set for anything that can possibly start the expression. + if (RegexPrefixAnalyzer.FindFirstCharClass(tree, culture) is (string CharClass, bool CaseInsensitive) set) + { + // See if the set is limited to holding only a few characters. + Span scratch = stackalloc char[5]; // max optimized by IndexOfAny today + int scratchCount; + char[]? chars = null; + if (!RegexCharClass.IsNegated(set.CharClass) && + (scratchCount = RegexCharClass.GetSetChars(set.CharClass, scratch)) > 0) + { + chars = scratch.Slice(0, scratchCount).ToArray(); + } + + if (!compiled && + chars is { Length: 1 }) + { + // The set contains one and only one character, meaning every match starts + // with the same literal value (potentially case-insensitive). Search for that. + FixedDistanceLiteral = (chars[0], 0); + FindMode = (_rightToLeft, set.CaseInsensitive) switch + { + (false, false) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive, + (true, true) => FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive, + }; + } + else + { + // The set may match multiple characters. Search for that. 
+ FixedDistanceSets = new() { (chars, set.CharClass, 0, set.CaseInsensitive) }; + FindMode = (_rightToLeft, set.CaseInsensitive) switch + { + (false, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive, + (true, true) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive, + }; + + // Non-compiled code will be using CharInClass with a cache, so initialize that cache. + if (!compiled) + { + _asciiLookups = new uint[FixedDistanceSets.Count][]; + } + } + } + return; + } + + // We're now left-to-right only and looking for sets. + + // Build up a list of all of the sets that are a fixed distance from the start of the expression. + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(tree, culture, thorough: !interpreter); + if (fixedDistanceSets is not null) + { + Debug.Assert(fixedDistanceSets.Count != 0); + + // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines + // don't need to special-case literals as they already do codegen to create the optimal lookup based on + // the set's characteristics. + if (!compiled && + fixedDistanceSets.Count == 1 && + fixedDistanceSets[0].Chars is { Length: 1 }) + { + FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], fixedDistanceSets[0].Distance); + FindMode = fixedDistanceSets[0].CaseInsensitive ? + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive : + FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive; + } + else + { + // Limit how many sets we use to avoid doing lots of unnecessary work. The list was already + // sorted from best to worst, so just keep the first ones up to our limit. 
+ const int MaxSetsToUse = 3; // arbitrary tuned limit + if (fixedDistanceSets.Count > MaxSetsToUse) + { + fixedDistanceSets.RemoveRange(MaxSetsToUse, fixedDistanceSets.Count - MaxSetsToUse); + } + + // Store the sets, and compute which mode to use. + FixedDistanceSets = fixedDistanceSets; + FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0, fixedDistanceSets[0].CaseInsensitive) switch + { + (true, true) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive, + (true, false) => FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive, + (false, true) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive, + (false, false) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive, + }; + + // Non-compiled code will be using CharInClass with a cache, so initialize that cache. + if (!compiled) + { + _asciiLookups = new uint[FixedDistanceSets.Count][]; + } + } + return; + } + } + + /// Gets the selected mode for performing the next operation + public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch; + + /// Gets the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc). + public int LeadingAnchor { get; } + + /// Gets the leading prefix. May be an empty string. + public string LeadingCaseSensitivePrefix { get; } = string.Empty; + + /// When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern. + public (char Literal, int Distance) FixedDistanceLiteral { get; } + + /// When in fixed distance set mode, gets the set and how far it is from the start of the pattern. + /// The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not. + public List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FixedDistanceSets { get; } + + /// Try to advance to the next starting position that might be a location for a match. + /// The text to search. 
+        /// <param name="pos">The position in <paramref name="text"/>. This is updated with the found position.</param>
+        /// <param name="beginning">The index in <paramref name="text"/> to consider the beginning for beginning anchor purposes.</param>
+        /// <param name="start">The index in <paramref name="text"/> to consider the start for start anchor purposes.</param>
+        /// <param name="end">The index in <paramref name="text"/> to consider the non-inclusive end of the string.</param>
+        /// <returns>true if a position to attempt a match was found; false if none was found.</returns>
+        /// <remarks>
+        /// When false is returned, <paramref name="pos"/> is set to <paramref name="end"/> for left-to-right
+        /// searches and to <paramref name="beginning"/> for right-to-left searches.
+        /// </remarks>
+        public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, int start, int end)
+        {
+            // Return early if we know there's not enough input left to match.
+            if (!_rightToLeft)
+            {
+                if (pos > end - _minRequiredLength)
+                {
+                    pos = end;
+                    return false;
+                }
+            }
+            else
+            {
+                if (pos - _minRequiredLength < beginning)
+                {
+                    pos = beginning;
+                    return false;
+                }
+            }
+
+            // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right). BOL is special, in that unlike
+            // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
+            // the other anchors, which all skip all subsequent processing if found, with BOL we just use it
+            // to boost our position to the next line, and then continue normally with any searches.
+            if (LeadingAnchor == RegexPrefixAnalyzer.Bol)
+            {
+                // If we're not currently positioned at the beginning of a line (either
+                // the beginning of the string or just after a line feed), find the next
+                // newline and position just after it.
+                Debug.Assert(!_rightToLeft);
+                if (pos > beginning && text[pos - 1] != '\n')
+                {
+                    int newline = text.IndexOf('\n', pos);
+                    if (newline == -1 || newline + 1 > end)
+                    {
+                        pos = end;
+                        return false;
+                    }
+
+                    pos = newline + 1;
+
+                    // The position moved forward, so the early-out check made at the top of the method no
+                    // longer covers it. Re-check that enough input remains: the fixed-distance literal
+                    // searches below slice text at (pos + distance) with length (end - pos - distance),
+                    // which would be negative (and throw) if pos were closer to end than the distance.
+                    if (pos > end - _minRequiredLength)
+                    {
+                        pos = end;
+                        return false;
+                    }
+                }
+            }
+
+            switch (FindMode)
+            {
+                // There's an anchor. For some, we can simply compare against the current position.
+                // For others, we can jump to the relevant location.
+
+                case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
+                    if (pos > beginning)
+                    {
+                        pos = end;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
+                    if (pos > start)
+                    {
+                        pos = end;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
+                    if (pos < end - 1)
+                    {
+                        pos = end - 1;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End:
+                    if (pos < end)
+                    {
+                        pos = end;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
+                    if (pos > beginning)
+                    {
+                        pos = beginning;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
+                    if (pos < start)
+                    {
+                        pos = beginning;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
+                    if (pos < end - 1 || (pos == end - 1 && text[pos] != '\n'))
+                    {
+                        pos = beginning;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
+                    if (pos < end)
+                    {
+                        pos = beginning;
+                        return false;
+                    }
+                    return true;
+
+                // There's a case-sensitive prefix. Search for it with ordinal IndexOf.
+
+                case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
+                    {
+                        int i = text.AsSpan(pos, end - pos).IndexOf(LeadingCaseSensitivePrefix.AsSpan());
+                        if (i >= 0)
+                        {
+                            pos += i;
+                            return true;
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive:
+                    {
+                        // The searched span ends at pos, so a hit is reported as the position just past the prefix.
+                        int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(LeadingCaseSensitivePrefix.AsSpan());
+                        if (i >= 0)
+                        {
+                            pos = beginning + i + LeadingCaseSensitivePrefix.Length;
+                            return true;
+                        }
+
+                        pos = beginning;
+                        return false;
+                    }
+
+                // There's a literal at the beginning of the pattern. Search for it.
+
+                case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive:
+                    {
+                        int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(FixedDistanceLiteral.Literal);
+                        if (i >= 0)
+                        {
+                            pos = beginning + i + 1;
+                            return true;
+                        }
+
+                        pos = beginning;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseInsensitive:
+                    {
+                        char ch = FixedDistanceLiteral.Literal;
+                        TextInfo ti = _textInfo;
+
+                        ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+                        for (int i = span.Length - 1; i >= 0; i--)
+                        {
+                            if (ti.ToLower(span[i]) == ch)
+                            {
+                                pos = beginning + i + 1;
+                                return true;
+                            }
+                        }
+
+                        pos = beginning;
+                        return false;
+                    }
+
+                // There's a set at the beginning of the pattern. Search for it.
+
+                case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
+                    {
+                        (char[]? chars, string set, _, _) = FixedDistanceSets![0];
+
+                        ReadOnlySpan<char> span = text.AsSpan(pos, end - pos);
+                        if (chars is not null)
+                        {
+                            // An explicit character list is available for the set: search for any of them directly.
+                            int i = span.IndexOfAny(chars);
+                            if (i >= 0)
+                            {
+                                pos += i;
+                                return true;
+                            }
+                        }
+                        else
+                        {
+                            // No character list: test each character against the set.
+                            ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+                            for (int i = 0; i < span.Length; i++)
+                            {
+                                if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+                                {
+                                    pos += i;
+                                    return true;
+                                }
+                            }
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
+                    {
+                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+                        string set = FixedDistanceSets![0].Set;
+                        TextInfo ti = _textInfo;
+
+                        ReadOnlySpan<char> span = text.AsSpan(pos, end - pos);
+                        for (int i = 0; i < span.Length; i++)
+                        {
+                            if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup))
+                            {
+                                pos += i;
+                                return true;
+                            }
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive:
+                    {
+                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+                        string set = FixedDistanceSets![0].Set;
+
+                        ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+                        for (int i = span.Length - 1; i >= 0; i--)
+                        {
+                            if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+                            {
+                                pos = beginning + i + 1;
+                                return true;
+                            }
+                        }
+
+                        pos = beginning;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive:
+                    {
+                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+                        string set = FixedDistanceSets![0].Set;
+                        TextInfo ti = _textInfo;
+
+                        ReadOnlySpan<char> span = text.AsSpan(beginning, pos - beginning);
+                        for (int i = span.Length - 1; i >= 0; i--)
+                        {
+                            if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup))
+                            {
+                                pos = beginning + i + 1;
+                                return true;
+                            }
+                        }
+
+                        pos = beginning;
+                        return false;
+                    }
+
+                // There's a literal at a fixed offset from the beginning of the pattern. Search for it.
+
+                case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive:
+                    {
+                        Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
+
+                        // Search starts Distance characters ahead of pos; the hit index is relative to that
+                        // shifted start, so adding it to pos backs off by Distance automatically.
+                        int i = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal);
+                        if (i >= 0)
+                        {
+                            pos += i;
+                            return true;
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive:
+                    {
+                        Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength);
+
+                        char ch = FixedDistanceLiteral.Literal;
+                        TextInfo ti = _textInfo;
+
+                        ReadOnlySpan<char> span = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance);
+                        for (int i = 0; i < span.Length; i++)
+                        {
+                            if (ti.ToLower(span[i]) == ch)
+                            {
+                                pos += i;
+                                return true;
+                            }
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                // There are one or more sets at fixed offsets from the start of the pattern.
+
+                case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
+                    {
+                        List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
+                        (char[]? primaryChars, string primarySet, int primaryDistance, _) = sets[0];
+                        int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength);
+
+                        if (primaryChars is not null)
+                        {
+                            for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+                            {
+                                // Jump to the next occurrence of any primary-set character, then back off by
+                                // the primary set's distance to get the candidate starting position.
+                                int offset = inputPosition + primaryDistance;
+                                int index = text.IndexOfAny(primaryChars, offset, end - offset);
+                                if (index < 0)
+                                {
+                                    break;
+                                }
+
+                                inputPosition = index - primaryDistance;
+                                if (inputPosition > endMinusRequiredLength)
+                                {
+                                    break;
+                                }
+
+                                // Every remaining set must also match at its own distance from the candidate.
+                                for (int i = 1; i < sets.Count; i++)
+                                {
+                                    (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+                                    char c = text[inputPosition + nextDistance];
+                                    if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i]))
+                                    {
+                                        goto Bumpalong;
+                                    }
+                                }
+
+                                pos = inputPosition;
+                                return true;
+
+                            Bumpalong:;
+                            }
+                        }
+                        else
+                        {
+                            ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+
+                            for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+                            {
+                                char c = text[inputPosition + primaryDistance];
+                                if (!RegexCharClass.CharInClass(c, primarySet, ref startingAsciiLookup))
+                                {
+                                    goto Bumpalong;
+                                }
+
+                                for (int i = 1; i < sets.Count; i++)
+                                {
+                                    (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+                                    c = text[inputPosition + nextDistance];
+                                    if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i]))
+                                    {
+                                        goto Bumpalong;
+                                    }
+                                }
+
+                                pos = inputPosition;
+                                return true;
+
+                            Bumpalong:;
+                            }
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
+                    {
+                        List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!;
+                        (_, string primarySet, int primaryDistance, _) = sets[0];
+
+                        int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength);
+                        TextInfo ti = _textInfo;
+                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+
+                        for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
+                        {
+                            char c = text[inputPosition + primaryDistance];
+                            if (!RegexCharClass.CharInClass(ti.ToLower(c), primarySet, ref startingAsciiLookup))
+                            {
+                                goto Bumpalong;
+                            }
+
+                            for (int i = 1; i < sets.Count; i++)
+                            {
+                                (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i];
+                                c = text[inputPosition + nextDistance];
+                                if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i]))
+                                {
+                                    goto Bumpalong;
+                                }
+                            }
+
+                            pos = inputPosition;
+                            return true;
+
+                        Bumpalong:;
+                        }
+
+                        pos = end;
+                        return false;
+                    }
+
+                // Nothing special to look for. Just return true indicating this is a valid position to try to match.
+
+                default:
+                    Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
+                    return true;
+            }
+        }
+    }
+
+    /// <summary>Mode to use for searching for the next location of a possible match.</summary>
+    internal enum FindNextStartingPositionMode
+    {
+        /// <summary>A "beginning" anchor at the beginning of the pattern.</summary>
+        LeadingAnchor_LeftToRight_Beginning,
+        /// <summary>A "start" anchor at the beginning of the pattern.</summary>
+        LeadingAnchor_LeftToRight_Start,
+        /// <summary>An "endz" anchor at the beginning of the pattern. This is rare.</summary>
+        LeadingAnchor_LeftToRight_EndZ,
+        /// <summary>An "end" anchor at the beginning of the pattern. This is rare.</summary>
+        LeadingAnchor_LeftToRight_End,
+
+        /// <summary>A "beginning" anchor at the beginning of the right-to-left pattern.</summary>
+        LeadingAnchor_RightToLeft_Beginning,
+        /// <summary>A "start" anchor at the beginning of the right-to-left pattern.</summary>
+        LeadingAnchor_RightToLeft_Start,
+        /// <summary>An "endz" anchor at the beginning of the right-to-left pattern. This is rare.</summary>
+        LeadingAnchor_RightToLeft_EndZ,
+        /// <summary>An "end" anchor at the beginning of the right-to-left pattern. This is rare.</summary>
+        LeadingAnchor_RightToLeft_End,
+
+        /// <summary>A case-sensitive multi-character substring at the beginning of the pattern.</summary>
+        LeadingPrefix_LeftToRight_CaseSensitive,
+        /// <summary>A case-sensitive multi-character substring at the beginning of the right-to-left pattern.</summary>
+        LeadingPrefix_RightToLeft_CaseSensitive,
+
+        /// <summary>A case-sensitive set starting the pattern.</summary>
+        LeadingSet_LeftToRight_CaseSensitive,
+        /// <summary>A case-insensitive set starting the pattern.</summary>
+        LeadingSet_LeftToRight_CaseInsensitive,
+        /// <summary>A case-sensitive set starting the right-to-left pattern.</summary>
+        LeadingSet_RightToLeft_CaseSensitive,
+        /// <summary>A case-insensitive set starting the right-to-left pattern.</summary>
+        LeadingSet_RightToLeft_CaseInsensitive,
+
+        /// <summary>A case-sensitive single character at the beginning of the right-to-left pattern.</summary>
+        LeadingLiteral_RightToLeft_CaseSensitive,
+        /// <summary>A case-insensitive single character at the beginning of the right-to-left pattern.</summary>
+        LeadingLiteral_RightToLeft_CaseInsensitive,
+
+        /// <summary>A case-sensitive single character at a fixed distance from the start of the pattern.</summary>
+        FixedLiteral_LeftToRight_CaseSensitive,
+        /// <summary>A case-insensitive single character at a fixed distance from the start of the pattern.</summary>
+        FixedLiteral_LeftToRight_CaseInsensitive,
+
+        /// <summary>One or more sets at a fixed distance from the start of the pattern. At least the first set is case-sensitive.</summary>
+        FixedSets_LeftToRight_CaseSensitive,
+        /// <summary>One or more sets at a fixed distance from the start of the pattern. At least the first set is case-insensitive.</summary>
+        FixedSets_LeftToRight_CaseInsensitive,
+
+        /// <summary>Nothing to search for. Nop.</summary>
+ NoSearch, + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index c679c0101d6f2..4351473b96fdb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -15,7 +15,6 @@ internal sealed class RegexInterpreter : RegexRunner private readonly RegexCode _code; private readonly TextInfo _textInfo; - private readonly FindFirstCharMode _findFirstCharMode; private int _operator; private int _codepos; @@ -29,48 +28,6 @@ public RegexInterpreter(RegexCode code, CultureInfo culture) _code = code; _textInfo = culture.TextInfo; - - // Determine what searching mode FindFirstChar will employ. - if ((_code.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End)) != 0) - { - _findFirstCharMode = (_code.LeadingAnchor, code.RightToLeft) switch - { - (RegexPrefixAnalyzer.Beginning, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning, - (RegexPrefixAnalyzer.Beginning, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning, - (RegexPrefixAnalyzer.Start, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_Start, - (RegexPrefixAnalyzer.Start, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_Start, - (RegexPrefixAnalyzer.End, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_End, - (RegexPrefixAnalyzer.End, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_End, - (_, false) => FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ, - (_, true) => FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ, - }; - } - else if (code.BoyerMoorePrefix is RegexBoyerMoore rbm) - { - _findFirstCharMode = rbm.PatternSupportsIndexOf ? 
- FindFirstCharMode.IndexOf : - FindFirstCharMode.BoyerMoore; - } - else if (code.LeadingCharClasses is not null) - { - (string charClass, bool caseInsensitive) = code.LeadingCharClasses[0]; - bool isSet = !RegexCharClass.IsSingleton(charClass); - _findFirstCharMode = (code.RightToLeft, caseInsensitive, isSet) switch - { - (false, false, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton, - (false, false, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set, - (false, true, false) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton, - (false, true, true) => FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set, - (true, false, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton, - (true, false, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set, - (true, true, false) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton, - (true, true, true) => FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set, - }; - } - else - { - _findFirstCharMode = FindFirstCharMode.NoSearch; - } } protected override void InitTrackCount() => runtrackcount = _code.TrackCount; @@ -372,306 +329,8 @@ private bool MatchRef(int index, int length) private void Backwardnext() => runtextpos += _rightToLeft ? 
1 : -1; - private enum FindFirstCharMode - { - LeadingAnchor_LeftToRight_Beginning, - LeadingAnchor_LeftToRight_Start, - LeadingAnchor_LeftToRight_EndZ, - LeadingAnchor_LeftToRight_End, - - LeadingAnchor_RightToLeft_Beginning, - LeadingAnchor_RightToLeft_Start, - LeadingAnchor_RightToLeft_EndZ, - LeadingAnchor_RightToLeft_End, - - IndexOf, - BoyerMoore, - - LeadingCharClass_LeftToRight_CaseSensitive_Singleton, - LeadingCharClass_LeftToRight_CaseSensitive_Set, - LeadingCharClass_LeftToRight_CaseInsensitive_Singleton, - LeadingCharClass_LeftToRight_CaseInsensitive_Set, - - LeadingCharClass_RightToLeft_CaseSensitive_Singleton, - LeadingCharClass_RightToLeft_CaseSensitive_Set, - LeadingCharClass_RightToLeft_CaseInsensitive_Singleton, - LeadingCharClass_RightToLeft_CaseInsensitive_Set, - - NoSearch, - } - - protected override bool FindFirstChar() - { - // Return early if we know there's not enough input left to match. - if (!_code.RightToLeft) - { - if (runtextpos > runtextend - _code.Tree.MinRequiredLength) - { - runtextpos = runtextend; - return false; - } - } - else - { - if (runtextpos - _code.Tree.MinRequiredLength < runtextbeg) - { - runtextpos = runtextbeg; - return false; - } - } - - // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike - // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike - // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any Boyer-Moore or - // leading char class searches. - if (_code.LeadingAnchor == RegexPrefixAnalyzer.Bol && - !_code.RightToLeft) // don't bother customizing this optimization for the very niche RTL + Multiline case - { - // If we're not currently positioned at the beginning of a line (either - // the beginning of the string or just after a line feed), find the next - // newline and position just after it. 
- if (runtextpos > runtextbeg && runtext![runtextpos - 1] != '\n') - { - int newline = runtext.IndexOf('\n', runtextpos); - if (newline == -1 || newline + 1 > runtextend) - { - runtextpos = runtextend; - return false; - } - - runtextpos = newline + 1; - } - } - - switch (_findFirstCharMode) - { - // If the pattern is anchored, we can update our position appropriately and return immediately. - // If there's a Boyer-Moore prefix, we can also validate it. - - case FindFirstCharMode.LeadingAnchor_LeftToRight_Beginning: - if (runtextpos > runtextbeg) - { - runtextpos = runtextend; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_Start: - if (runtextpos > runtextstart) - { - runtextpos = runtextend; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_EndZ: - if (runtextpos < runtextend - 1) - { - runtextpos = runtextend - 1; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_LeftToRight_End: - if (runtextpos < runtextend) - { - runtextpos = runtextend; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_Beginning: - if (runtextpos > runtextbeg) - { - runtextpos = runtextbeg; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_Start: - if (runtextpos < runtextstart) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_EndZ: - if (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && runtext![runtextpos] != '\n')) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - case FindFirstCharMode.LeadingAnchor_RightToLeft_End: - if (runtextpos < runtextend) - { - runtextpos = runtextbeg; - return false; - } - return NoPrefixOrPrefixMatches(); - - // There was a prefix. Scan for it. 
- - case FindFirstCharMode.IndexOf: - { - int i = runtext.AsSpan(runtextpos, runtextend - runtextpos).IndexOf(_code.BoyerMoorePrefix!.Pattern); - if (i >= 0) - { - runtextpos += i; - return true; - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.BoyerMoore: - runtextpos = _code.BoyerMoorePrefix!.Scan(runtext!, runtextpos, runtextbeg, runtextend); - if (runtextpos >= 0) - { - return true; - } - runtextpos = _code.RightToLeft ? runtextbeg : runtextend; - return false; - - // There's a leading character class. Search for it. - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Singleton: - { - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - int i = span.IndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass)); - if (i >= 0) - { - runtextpos += i; - return true; - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseSensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - for (int i = 0; i < span.Length; i++) - { - if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Singleton: - { - char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass); - TextInfo ti = _textInfo; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - for (int i = 0; i < span.Length; i++) - { - if (ch == ti.ToLower(span[i])) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_LeftToRight_CaseInsensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextpos, runtextend - runtextpos); - 
TextInfo ti = _textInfo; - for (int i = 0; i < span.Length; i++) - { - if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos += i; - return true; - } - } - runtextpos = runtextend; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Singleton: - { - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - int i = span.LastIndexOf(RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass)); - if (i >= 0) - { - runtextpos = runtextbeg + i + 1; - return true; - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseSensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - for (int i = span.Length - 1; i >= 0; i--) - { - if (RegexCharClass.CharInClass(span[i], set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos = runtextbeg + i + 1; - return true; - } - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Singleton: - { - char ch = RegexCharClass.SingletonChar(_code.LeadingCharClasses![0].CharClass); - TextInfo ti = _textInfo; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - for (int i = span.Length - 1; i >= 0; i--) - { - if (ch == ti.ToLower(span[i])) - { - runtextpos = runtextbeg + i + 1; - return true; - } - } - runtextpos = runtextbeg; - return false; - } - - case FindFirstCharMode.LeadingCharClass_RightToLeft_CaseInsensitive_Set: - { - string set = _code.LeadingCharClasses![0].CharClass; - ReadOnlySpan span = runtext.AsSpan(runtextbeg, runtextpos - runtextbeg); - TextInfo ti = _textInfo; - for (int i = span.Length - 1; i >= 0; i--) - { - if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref _code.LeadingCharClassAsciiLookup)) - { - runtextpos = runtextbeg + i + 1; - return true; - } - 
} - runtextpos = runtextbeg; - return false; - } - - // Nothing special to look for. Just return true indicating this is a valid position to try to match. - - default: - Debug.Assert(_findFirstCharMode == FindFirstCharMode.NoSearch); - return true; - } - - bool NoPrefixOrPrefixMatches() => - _code.BoyerMoorePrefix is not RegexBoyerMoore rbm || - rbm.IsMatch(runtext!, runtextpos, runtextbeg, runtextend); - } + protected override bool FindFirstChar() => + _code.FindOptimizations.TryFindNextStartingPosition(runtext!, ref runtextpos, runtextbeg, runtextstart, runtextend); protected override void Go() { @@ -1230,7 +889,7 @@ protected override void Go() int operand0 = Operand(0); string set = _code.Strings[operand0]; - ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0]; + ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0]; while (c-- > 0) { @@ -1322,7 +981,7 @@ protected override void Go() int len = Math.Min(Operand(1), Forwardchars()); int operand0 = Operand(0); string set = _code.Strings[operand0]; - ref int[]? setLookup = ref _code.StringsAsciiLookup[operand0]; + ref uint[]? 
setLookup = ref _code.StringsAsciiLookup[operand0]; int i; for (i = len; i > 0; i--) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 8ed30bbcb266b..53b78c5d32479 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -39,9 +39,6 @@ public RegexRunnerFactory FactoryInstanceFromCode(string pattern, RegexCode code _code = code; _codes = code.Codes; _strings = code.Strings; - _leadingCharClasses = code.LeadingCharClasses; - _boyerMoorePrefix = code.BoyerMoorePrefix; - _leadingAnchor = code.LeadingAnchor; _trackcount = code.TrackCount; _options = options; _hasTimeout = hasTimeout; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index c23bb50720a05..e7b0e71076df1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -41,6 +41,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; using System.Threading; namespace System.Text.RegularExpressions @@ -151,11 +152,49 @@ public RegexNode(int type, RegexOptions options, int m, int n) N = n; } - public bool UseOptionR() => (Options & RegexOptions.RightToLeft) != 0; + /// Creates a RegexNode representing a single character. + /// The character. + /// The node's options. + /// The culture to use to perform any required transformations. + /// The created RegexNode. This might be a RegexNode.One or a RegexNode.Set. 
+        public static RegexNode CreateOneWithCaseConversion(char ch, RegexOptions options, CultureInfo? culture)
+        {
+            // Without IgnoreCase there is nothing to convert: the character becomes a One node as-is.
+            bool ignoreCase = (options & RegexOptions.IgnoreCase) != 0;
+            if (!ignoreCase)
+            {
+                return new RegexNode(One, options, ch);
+            }
+
+            Debug.Assert(culture is not null);
+            RegexOptions caseSensitiveOptions = options & ~RegexOptions.IgnoreCase;
+
+            // A character that doesn't participate in case conversion can simply be matched
+            // case-sensitively, so IgnoreCase is dropped from the node's options.
+            if (!RegexCharClass.ParticipatesInCaseConversion(ch))
+            {
+                return new RegexNode(One, caseSensitiveOptions, ch);
+            }
+
+            // Try to build a set containing all case-insensitive equivalents of the character. When that
+            // succeeds (the out flag is false) the set is matched case-sensitively. Otherwise, until ToLower
+            // usage at match time can be removed entirely (https://github.com/dotnet/runtime/issues/61048),
+            // fall back to an IgnoreCase One node holding the lowercased character.
+            string equivalenceSet = RegexCharClass.OneToStringClass(ch, culture, out bool stillCaseInsensitive);
+            return stillCaseInsensitive ?
+                new RegexNode(One, options, culture.TextInfo.ToLower(ch)) :
+                new RegexNode(Set, caseSensitiveOptions, equivalenceSet);
+        }
- public RegexNode ReverseLeft() + /// Reverses all children of a concatenation when in RightToLeft mode.
+ public RegexNode ReverseConcatenationIfRightToLeft() { - if (UseOptionR() && Type == Concatenate && ChildCount() > 1) + if ((Options & RegexOptions.RightToLeft) != 0 && + Type == Concatenate && + ChildCount() > 1) { ((List)Children!).Reverse(); } @@ -203,13 +242,26 @@ private void ValidateFinalTreeInvariants() { RegexNode node = toExamine.Pop(); + // Add all children to be examined + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + RegexNode child = node.Child(i); + Debug.Assert(child.Next == node, $"{child.Description()} missing reference to parent {node.Description()}"); + + toExamine.Push(child); + } + // Validate that we never see certain node types. Debug.Assert(Type != Group, "All Group nodes should have been removed."); - // Validate expected child counts. - int childCount = node.ChildCount(); + // Validate node types and expected child counts. switch (node.Type) { + case Group: + Debug.Fail("All Group nodes should have been removed."); + break; + case Beginning: case Bol: case Boundary: @@ -247,25 +299,20 @@ private void ValidateFinalTreeInvariants() case Prevent: case Require: Debug.Assert(childCount == 1, $"Expected one and only one child for {node.TypeName}, got {childCount}."); - toExamine.Push(node.Child(0)); break; case Testref: case Testgroup: Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}."); - for (int i = 0; i < childCount; i++) - { - toExamine.Push(node.Child(i)); - } break; case Concatenate: case Alternate: Debug.Assert(childCount >= 2, $"Expected at least two children for {node.TypeName}, got {childCount}."); - for (int i = 0; i < childCount; i++) - { - toExamine.Push(node.Child(i)); - } + break; + + default: + Debug.Fail($"Unexpected node type: {node.Type}"); break; } @@ -273,6 +320,10 @@ private void ValidateFinalTreeInvariants() switch (node.Type) { case Multi: + Debug.Assert(node.Str is not null, "Expect non-null multi string"); + 
Debug.Assert(node.Str.Length >= 2, $"Expected {node.Str} to be at least two characters"); + break; + case Set: case Setloop: case Setloopatomic: @@ -881,8 +932,10 @@ private RegexNode ReduceAlternation() default: ReduceSingleLetterAndNestedAlternations(); - RegexNode newThis = ReplaceNodeIfUnnecessary(Nothing); - return newThis != this ? newThis : ExtractCommonPrefixes(); + RegexNode node = ReplaceNodeIfUnnecessary(Nothing); + node = ExtractCommonPrefixText(node); + node = ExtractCommonPrefixOneNotoneSet(node); + return node; } // This function performs two optimizations: @@ -952,7 +1005,6 @@ void ReduceSingleLetterAndNestedAlternations() break; } - // The last node was a Set or a One, we're a Set or One and our options are the same. // Merge the two nodes. j--; @@ -981,6 +1033,12 @@ void ReduceSingleLetterAndNestedAlternations() prev.Type = Set; prev.Str = prevCharClass.ToStringClass(Options); + if ((prev.Options & RegexOptions.IgnoreCase) != 0 && + RegexCharClass.MakeCaseSensitiveIfPossible(prev.Str, RegexParser.GetTargetCulture(prev.Options)) is string newSetString) + { + prev.Str = newSetString; + prev.Options &= ~RegexOptions.IgnoreCase; + } } else if (at.Type == Nothing) { @@ -1001,6 +1059,106 @@ void ReduceSingleLetterAndNestedAlternations() } }
+            // This function optimizes out prefix nodes from alternation branches that are
+            // the same across multiple contiguous branches.
+            // e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
+            static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
+            {
+                if (alternation.Type != Alternate)
+                {
+                    return alternation;
+                }
+
+                Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
+                var children = (List<RegexNode>)alternation.Children;
+
+                // Only process left-to-right prefixes.
+                if ((alternation.Options & RegexOptions.RightToLeft) != 0)
+                {
+                    return alternation;
+                }
+
+                // Only handle the case where each branch is a concatenation
+                foreach (RegexNode child in children)
+                {
+                    if (child.Type != Concatenate || child.ChildCount() < 2)
+                    {
+                        return alternation;
+                    }
+                }
+
+                for (int startingIndex = 0; startingIndex < children.Count - 1; startingIndex++)
+                {
+                    Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });
+
+                    // Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
+                    // Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
+                    // it for non-atomic variable length loops could change behavior as each branch could otherwise have a
+                    // different number of characters consumed by the loop based on what's after it.
+                    RegexNode required = children[startingIndex].Child(0);
+                    switch (required.Type)
+                    {
+                        case One or Notone or Set:
+                        case Oneloopatomic or Notoneloopatomic or Setloopatomic:
+                        case Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy when required.M == required.N:
+                            break;
+
+                        default:
+                            continue;
+                    }
+
+                    // Only handle the case where each branch begins with the exact same node value
+                    int endingIndex = startingIndex + 1;
+                    for (; endingIndex < children.Count; endingIndex++)
+                    {
+                        RegexNode other = children[endingIndex].Child(0);
+                        if (required.Type != other.Type ||
+                            required.Options != other.Options ||
+                            required.M != other.M ||
+                            required.N != other.N ||
+                            required.Ch != other.Ch ||
+                            required.Str != other.Str)
+                        {
+                            break;
+                        }
+                    }
+
+                    if (endingIndex - startingIndex <= 1)
+                    {
+                        // Nothing to extract from this starting index.
+                        continue;
+                    }
+
+                    // Remove the prefix node from every branch, adding it to a new alternation
+                    var newAlternate = new RegexNode(Alternate, alternation.Options);
+                    for (int i = startingIndex; i < endingIndex; i++)
+                    {
+                        ((List<RegexNode>)children[i].Children!).RemoveAt(0);
+                        newAlternate.AddChild(children[i]);
+                    }
+
+                    // If this alternation is wrapped as atomic, we need to do the same for the new alternation.
+                    if (alternation.Next is RegexNode parent && parent.Type == Atomic)
+                    {
+                        var atomic = new RegexNode(Atomic, alternation.Options);
+                        atomic.AddChild(newAlternate);
+                        newAlternate = atomic;
+                    }
+
+                    // Now create a concatenation of the prefix node with the new alternation for the combined
+                    // branches, and replace all of the branches in this alternation with that new concatenation.
+                    var newConcat = new RegexNode(Concatenate, alternation.Options);
+                    newConcat.AddChild(required);
+                    newConcat.AddChild(newAlternate);
+                    alternation.ReplaceChild(startingIndex, newConcat);
+                    children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1);
+                }
+
+                // If we've reduced this alternation to just a single branch, return it.
+                // Otherwise, return the alternation.
+                return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation;
+            }
+
// Analyzes all the branches of the alternation for text that's identical at the beginning // of every branch. That text is then pulled out into its own one or multi node in a // concatenation with the alternation (whose branches are updated to remove that prefix). @@ -1010,22 +1168,25 @@ void ReduceSingleLetterAndNestedAlternations() // by sets that can be merged. Third, it reduces the amount of duplicated comparisons required // if we end up backtracking into subsequent branches. // e.g.
abc|ade => a(?bc|de) - RegexNode ExtractCommonPrefixes() + static RegexNode ExtractCommonPrefixText(RegexNode alternation) { + if (alternation.Type != Alternate) + { + return alternation; + } + + Debug.Assert(alternation.Children is List { Count: >= 2 }); + var children = (List)alternation.Children; + // To keep things relatively simple, we currently only handle: // - Left to right (e.g. we don't process alternations in lookbehinds) // - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes. // - All branches having the same options. - // - Text, rather than also trying to combine identical sets that start each branch. - - Debug.Assert(Children is List); - var children = (List)Children; - Debug.Assert(children.Count >= 2); // Only extract left-to-right prefixes. - if ((Options & RegexOptions.RightToLeft) != 0) + if ((alternation.Options & RegexOptions.RightToLeft) != 0) { - return this; + return alternation; } Span scratchChar = stackalloc char[1]; @@ -1036,7 +1197,7 @@ RegexNode ExtractCommonPrefixes() RegexNode? 
startingNode = children[startingIndex].FindBranchOneOrMultiStart(); if (startingNode is null) { - return this; + return alternation; } RegexOptions startingNodeOptions = startingNode.Options; @@ -1159,7 +1320,7 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) } } - if (Next is RegexNode parent && parent.Type == Atomic) + if (alternation.Next is RegexNode parent && parent.Type == Atomic) { var atomic = new RegexNode(Atomic, startingNodeOptions); atomic.AddChild(newAlternate); @@ -1169,11 +1330,11 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) var newConcat = new RegexNode(Concatenate, startingNodeOptions); newConcat.AddChild(prefix); newConcat.AddChild(newAlternate); - ReplaceChild(startingIndex, newConcat); + alternation.ReplaceChild(startingIndex, newConcat); children.RemoveRange(startingIndex + 1, endingIndex - startingIndex - 1); } - return ChildCount() == 1 ? Child(0) : this; + return alternation.ChildCount() == 1 ? alternation.Child(0) : alternation; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 1f4a05afa47c1..0bda8a2367ed6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -81,6 +81,10 @@ private RegexParser(string pattern, RegexOptions options, CultureInfo culture, S { } + /// Gets the culture to use based on the specified options. + internal static CultureInfo GetTargetCulture(RegexOptions options) => + (options & RegexOptions.CultureInvariant) != 0 ? 
CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture) { var parser = new RegexParser(pattern, options, culture, stackalloc int[OptionStackDefaultSize]); @@ -319,7 +323,12 @@ private RegexNode ScanRegex() goto ContinueOuterScan; case '[': - AddUnitSet(ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options)); + { + string setString = ScanCharClass(UseOptionI(), scanOnly: false)!.ToStringClass(_options); + _unit = UseOptionI() && RegexCharClass.MakeCaseSensitiveIfPossible(setString, _culture) is string newSetString ? + new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, newSetString) : + new RegexNode(RegexNode.Set, _options, setString); + } break; case '(': @@ -378,14 +387,9 @@ private RegexNode ScanRegex() break; case '.': - if (UseOptionS()) - { - AddUnitSet(RegexCharClass.AnyClass); - } - else - { - AddUnitNotone('\n'); - } + _unit = UseOptionS() ? + new RegexNode(RegexNode.Set, _options & ~RegexOptions.IgnoreCase, RegexCharClass.AnyClass) : + new RegexNode(RegexNode.Notone, _options & ~RegexOptions.IgnoreCase, '\n'); break; case '{': @@ -734,21 +738,17 @@ node.M is not (0 or RegexReplacement.LeftPortion or RegexReplacement.RightPortio { // we aren't in a range, and now there is a subtraction. Usually this happens // only when a subtraction follows a range, like [a-z-[b]] + MoveRight(); + RegexCharClass? rcc = ScanCharClass(caseInsensitive, scanOnly); if (!scanOnly) { - MoveRight(1); - charClass!.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly)!); + charClass!.AddSubtraction(rcc!); if (CharsRight() > 0 && RightChar() != ']') { throw MakeException(RegexParseError.ExclusionGroupNotLast, SR.ExclusionGroupNotLast); } } - else - { - MoveRight(1); - ScanCharClass(caseInsensitive, scanOnly); - } } else { @@ -1173,32 +1173,32 @@ private void ScanBlank() case 'w': MoveRight(); return scanOnly ? 
null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass); case 'W': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass); case 's': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass); case 'S': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass); case 'd': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass); case 'D': MoveRight(); return scanOnly ? null : - new RegexNode(RegexNode.Set, _options, UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass); + new RegexNode(RegexNode.Set, RemoveIgnoreCaseIfNotEcma(_options), UseOptionE() ? 
RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass); case 'p': case 'P': @@ -1220,6 +1220,22 @@ private void ScanBlank() default: return ScanBasicBackslash(scanOnly); } + + static RegexOptions RemoveIgnoreCaseIfNotEcma(RegexOptions options) + { + // This function is used for \w, \W, \d, \D, \s, and \S to remove IgnoreCase, + // since they already include the notion of casing in their definitions. + // However, for compatibility, if ECMAScript is specified, we avoid stripping + // out the IgnoreCase. We should revisit this as part of https://github.com/dotnet/runtime/issues/61048, + // as it seems wrong that specifying ECMAScript (which implies non-Unicode) would + // then still involve lowercasing potentially Unicode character inputs to match + // against these sets. + if ((options & RegexOptions.ECMAScript) == 0) + { + options &= ~RegexOptions.IgnoreCase; + } + return options; + } } /// Scans \-style backreferences and character escapes @@ -1354,12 +1370,9 @@ private void ScanBlank() Textto(backpos); ch = ScanCharEscape(); - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - return scanOnly ? null : new RegexNode(RegexNode.One, _options, ch); + return !scanOnly ? 
+ RegexNode.CreateOneWithCaseConversion(ch, _options, _culture) : + null; } /* @@ -1369,7 +1382,7 @@ private RegexNode ScanDollar() { if (CharsRight() == 0) { - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); } char ch = RightChar(); @@ -1469,7 +1482,7 @@ private RegexNode ScanDollar() { case '$': MoveRight(); - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); case '&': capnum = 0; @@ -1502,7 +1515,7 @@ private RegexNode ScanDollar() // unrecognized $: literalize Textto(backpos); - return new RegexNode(RegexNode.One, _options, '$'); + return RegexNode.CreateOneWithCaseConversion('$', _options, _culture); } /// Throws on unsupported capture references for NonBacktracking in replacement patterns. @@ -2149,50 +2162,26 @@ private bool IsTrueQuantifier() /// Add a string to the last concatenate. private void AddConcatenate(int pos, int cch, bool isReplacement) { - if (cch == 0) + switch (cch) { - return; - } + case 0: + return; - RegexNode node; - if (cch > 1) - { - string str = UseOptionI() && !isReplacement ? -#if REGEXGENERATOR - StringExtensions.Create -#else - string.Create -#endif - (cch, (_pattern, _culture, pos, cch), static (dest, state) => - { - // We do the ToLower character-by character for consistency with the rest of the implementation. - // With surrogate pairs, doing a ToLower on the entire string is more correct linguistically, but - // Regex doesn't support surrogates, and not doing this character-by-character then causes differences - // from matching where characters are lowercased individually. 
- ReadOnlySpan src = state._pattern.AsSpan(state.pos, state.cch); - TextInfo ti = state._culture.TextInfo; - for (int i = 0; i < dest.Length; i++) - { - dest[i] = ti.ToLower(src[i]); - } - }) : - _pattern.Substring(pos, cch); - - node = new RegexNode(RegexNode.Multi, _options, str); - } - else - { - char ch = _pattern[pos]; + case 1: + _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture)); + break; - if (UseOptionI() && !isReplacement) - { - ch = _culture.TextInfo.ToLower(ch); - } + case > 1 when !UseOptionI() || isReplacement: + _concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch))); + break; - node = new RegexNode(RegexNode.One, _options, ch); + default: + foreach (char c in _pattern.AsSpan(pos, cch)) + { + _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(c, _options, _culture)); + } + break; } - - _concatenation!.AddChild(node); } /// Push the parser state (in response to an open paren) @@ -2243,11 +2232,11 @@ private void AddAlternate() if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref) { - _group.AddChild(_concatenation!.ReverseLeft()); + _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); } else { - _alternation!.AddChild(_concatenation!.ReverseLeft()); + _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); } _concatenation = new RegexNode(RegexNode.Concatenate, _options); @@ -2273,29 +2262,7 @@ private void AddConcatenate(bool lazy, int min, int max) private RegexNode? 
Unit() => _unit; /// Sets the current unit to a single char node - private void AddUnitOne(char ch) - { - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - _unit = new RegexNode(RegexNode.One, _options, ch); - } - - /// Sets the current unit to a single inverse-char node - private void AddUnitNotone(char ch) - { - if (UseOptionI()) - { - ch = _culture.TextInfo.ToLower(ch); - } - - _unit = new RegexNode(RegexNode.Notone, _options, ch); - } - - /// Sets the current unit to a single set node - private void AddUnitSet(string cc) => _unit = new RegexNode(RegexNode.Set, _options, cc); + private void AddUnitOne(char ch) => _unit = RegexNode.CreateOneWithCaseConversion(ch, _options, _culture); /// Sets the current unit to a subtree private void AddUnitNode(RegexNode node) => _unit = node; @@ -2308,7 +2275,7 @@ private void AddGroup() { if (_group!.Type == RegexNode.Testgroup || _group.Type == RegexNode.Testref) { - _group.AddChild(_concatenation!.ReverseLeft()); + _group.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); if (_group.Type == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3) { @@ -2317,7 +2284,7 @@ private void AddGroup() } else { - _alternation!.AddChild(_concatenation!.ReverseLeft()); + _alternation!.AddChild(_concatenation!.ReverseConcatenationIfRightToLeft()); _group.AddChild(_alternation); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 96a709b2338d4..22a2abba1946a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -5,6 +5,8 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; +using 
System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions { @@ -40,256 +42,518 @@ private RegexPrefixAnalyzer(Span intStack) _skipAllChildren = false; } - /// Computes the leading substring in . - /// It's quite trivial and gives up easily, in which case an empty string is returned. - public static (string Prefix, bool CaseInsensitive) ComputeLeadingSubstring(RegexTree tree) + /// Computes the leading substring in ; may be empty. + public static string FindCaseSensitivePrefix(RegexTree tree) { - RegexNode curNode = tree.Root; - RegexNode? concatNode = null; - int nextChild = 0; + var vsb = new ValueStringBuilder(stackalloc char[64]); + Process(tree.Root, ref vsb); + return vsb.ToString(); - while (true) + // Processes the node, adding any prefix text to the builder. + // Returns whether processing should continue with subsequent nodes. + static bool Process(RegexNode node, ref ValueStringBuilder vsb) { - switch (curNode.Type) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { + // If we're too deep on the stack, just give up finding any more prefix. + return false; + } + + // We don't bother to handle reversed input, so process at most one node + // when handling RightToLeft. 
+ bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + + switch (node.Type) + { + // Concatenation case RegexNode.Concatenate: - if (curNode.ChildCount() > 0) { - concatNode = curNode; - nextChild = 0; + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (!Process(node.Child(i), ref vsb)) + { + return false; + } + } + return !rtl; } - break; - case RegexNode.Atomic: - case RegexNode.Capture: - curNode = curNode.Child(0); - concatNode = null; - continue; + // Alternation: find a string that's a shared prefix of all branches + case RegexNode.Alternate: + { + int childCount = node.ChildCount(); - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - case RegexNode.Onelazy: + // Store the initial branch into the target builder + int initialLength = vsb.Length; + bool keepExploring = Process(node.Child(0), ref vsb); + int addedLength = vsb.Length - initialLength; - // In release, cutoff at a length to which we can still reasonably construct a string and Boyer-Moore search. - // In debug, use a smaller cutoff to exercise the cutoff path in tests - const int Cutoff = -#if DEBUG - 50; -#else - RegexBoyerMoore.MaxLimit; -#endif + // Then explore the rest of the branches, finding the length + // of a prefix they all share in common with the initial branch. + if (addedLength != 0) + { + var alternateSb = new ValueStringBuilder(64); - if (curNode.M > 0 && curNode.M < Cutoff) - { - return (new string(curNode.Ch, curNode.M), (curNode.Options & RegexOptions.IgnoreCase) != 0); - } + // Process each branch. If we reach a point where we've proven there's + // no overlap, we can bail early. + for (int i = 1; i < childCount && addedLength != 0; i++) + { + alternateSb.Length = 0; + + // Process the branch. We want to keep exploring after this alternation, + // but we can't if either this branch doesn't allow for it or if the prefix + // supplied by this branch doesn't entirely match all the previous ones.
+ keepExploring &= Process(node.Child(i), ref alternateSb); + keepExploring &= alternateSb.Length == addedLength; + + addedLength = Math.Min(addedLength, alternateSb.Length); + for (int j = 0; j < addedLength; j++) + { + if (vsb[initialLength + j] != alternateSb[j]) + { + addedLength = j; + keepExploring = false; + break; + } + } + } - return (string.Empty, false); + alternateSb.Dispose(); - case RegexNode.One: - return (curNode.Ch.ToString(), (curNode.Options & RegexOptions.IgnoreCase) != 0); + // Then cull back on what was added based on the other branches. + vsb.Length = initialLength + addedLength; + } - case RegexNode.Multi: - return (curNode.Str!, (curNode.Options & RegexOptions.IgnoreCase) != 0); + return !rtl && keepExploring; + } + // One character + case RegexNode.One when (node.Options & RegexOptions.IgnoreCase) == 0: + vsb.Append(node.Ch); + return !rtl; + + // Multiple characters + case RegexNode.Multi when (node.Options & RegexOptions.IgnoreCase) == 0: + vsb.Append(node.Str); + return !rtl; + + // Loop of one character + case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0: + const int SingleCharIterationLimit = 32; // arbitrary cut-off to avoid creating super long strings unnecessarily + int count = Math.Min(node.M, SingleCharIterationLimit); + vsb.Append(node.Ch, count); + return count == node.N && !rtl; + + // Loop of a node + case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0: + { + const int NodeIterationLimit = 4; // arbitrary cut-off to avoid creating super long strings unnecessarily + int limit = Math.Min(node.M, NodeIterationLimit); + for (int i = 0; i < limit; i++) + { + if (!Process(node.Child(0), ref vsb)) + { + return false; + } + } + return limit == node.N && !rtl; + } + + // Grouping nodes for which we only care about their single child + case RegexNode.Atomic: + case RegexNode.Capture: + return Process(node.Child(0), ref vsb); + + // Zero-width 
anchors and assertions case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.ECMABoundary: + case RegexNode.NonBoundary: + case RegexNode.NonECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: case RegexNode.Empty: + case RegexNode.UpdateBumpalong: case RegexNode.Require: case RegexNode.Prevent: - break; + return true; + // Give up for anything else default: - return (string.Empty, false); + return false; } - - if (concatNode == null || nextChild >= concatNode.ChildCount()) - { - return (string.Empty, false); - } - - curNode = concatNode.Child(nextChild++); } } - /// Computes a character class for the first character in . - /// true if a character class could be computed; otherwise, false. - public static (string CharClass, bool CaseInsensitive)[]? ComputeFirstCharClass(RegexTree tree) + /// Finds sets at fixed-offsets from the beginning of the pattern. + /// The RegexNode tree. + /// The culture to use for any case conversions. + /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete. + /// The list of found sets, or null if there aren't any. + public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets( + RegexTree tree, CultureInfo culture, bool thorough) { - var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]); - RegexFC? fc = s.RegexFCFromRegexTree(tree); - s.Dispose(); + const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing + const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily - if (fc == null || fc._nullable) + // Find all fixed-distance sets. + var results = new List<(char[]?
Chars, string Set, int Distance, bool CaseInsensitive)>(); + int distance = 0; + TryFindFixedSets(tree.Root, results, ref distance, culture, thorough); +#if DEBUG + foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) result in results) { - return null; + Debug.Assert(result.Distance <= tree.MinRequiredLength, $"Min: {tree.MinRequiredLength}, Distance: {result.Distance}, Tree: {tree}"); } +#endif - if (fc.CaseInsensitive) + // Remove any sets that match everything; they're not helpful. (This check exists primarily to weed + // out use of . in Singleline mode.) + bool hasAny = false; + for (int i = 0; i < results.Count; i++) { - fc.AddLowercase(((tree.Options & RegexOptions.CultureInvariant) != 0) ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); + if (results[i].Set == RegexCharClass.AnyClass) + { + hasAny = true; + break; + } } - - return new[] { (fc.GetFirstChars(), fc.CaseInsensitive) }; - } - - /// Computes character classes for the first characters in . - /// - /// For example, given "hello|world" and a of 3, this will compute the sets [hw], [eo], and [lr]. - /// As with some of the other computations, it's quite trivial and gives up easily; for example, we could in - /// theory handle nodes in a concatenation after an alternation, but we look only at the branches of the - /// alternation itself. As this computation is intended primarily to handle global alternations, it's currently - /// a reasonable tradeoff between simplicity, performance, and the fullness of potential optimizations. - /// - public static (string CharClass, bool CaseInsensitive)[]? ComputeMultipleCharClasses(RegexTree tree, int maxChars) - { - Debug.Assert(maxChars > 1); - - if ((tree.Options & RegexOptions.RightToLeft) != 0) + if (hasAny) { - // We don't bother for RightToLeft. It's rare and adds non-trivial complication. 
- return null; + results.RemoveAll(s => s.Set == RegexCharClass.AnyClass); } - // The known minimum required length will have already factored in knowledge about alternations. - // If the known min length is less than the maximum number of chars requested, we can - // cut this short. If it's zero, there's nothing to be found. If it's one, we won't do - // any better than ComputeFirstCharClass (and likely worse). Otherwise, don't bother looking for more - // the min of the min length and the max requested chars. - maxChars = Math.Min(tree.MinRequiredLength, maxChars); - if (maxChars <= 1) + // If we don't have any results, try harder to compute one for the starting character. + // This is a more involved computation that can find things the fixed-distance investigation + // doesn't. + if (results.Count == 0) { - return null; + (string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(tree, culture); + if (first is not null) + { + results.Add((null, first.Value.CharClass, 0, first.Value.CaseInsensitive)); + } + + if (results.Count == 0) + { + return null; + } } - // Find an alternation on the path to the first node. If we can't, bail. - RegexNode node = tree.Root; - while (node.Type != RegexNode.Alternate) + // For every entry, see if we can mark any that are case-insensitive as actually being case-sensitive + // based on not participating in case conversion. And then for ones that are case-sensitive, try to + // get the chars that make up the set, if there are few enough. + Span scratch = stackalloc char[5]; // max optimized by IndexOfAny today + for (int i = 0; i < results.Count; i++) { - switch (node.Type) + (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) result = results[i]; + if (!RegexCharClass.IsNegated(result.Set)) { - case RegexNode.Atomic: - case RegexNode.Capture: - case RegexNode.Concatenate: - node = node.Child(0); - break; + int count = RegexCharClass.GetSetChars(result.Set, scratch); + if (count != 0) + { + if (result.CaseInsensitive && !RegexCharClass.ParticipatesInCaseConversion(scratch.Slice(0, count))) + { + result.CaseInsensitive = false; + } - default: - return null; + if (!result.CaseInsensitive) + { + result.Chars = scratch.Slice(0, count).ToArray(); + } + + results[i] = result; + } } } - Debug.Assert(node.Type == RegexNode.Alternate); - // Create RegexCharClasses to store the built-up sets. We may end up returning fewer - // than this if we find we can't easily fill this number of sets with 100% confidence. - var classes = new RegexCharClass?[maxChars]; - bool caseInsensitive = false; + // Finally, try to move the "best" results to be earlier. "best" here are ones we're able to search + // for the fastest and that have the best chance of matching as few false positives as possible. + results.Sort((s1, s2) => + { + if (s1.CaseInsensitive != s2.CaseInsensitive) + { + // If their case-sensitivities don't match, whichever is case-sensitive comes first / is considered lower. + return s1.CaseInsensitive ? 1 : -1; + } + + if (s1.Chars is not null && s2.Chars is not null) + { + // Then of the ones that are the same length, prefer those with less frequent values. The frequency is + // only an approximation, used as a tie-breaker when we'd otherwise effectively be picking randomly. True + // frequencies will vary widely based on the actual data being searched, the language of the data, etc. 
+ int c = SumFrequencies(s1.Chars).CompareTo(SumFrequencies(s2.Chars)); + if (c != 0) + { + return c; + } - int branches = node.ChildCount(); - Debug.Assert(branches >= 2); - for (int branchNum = 0; branchNum < branches; branchNum++) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static float SumFrequencies(char[] chars) + { + float sum = 0; + foreach (char c in chars) + { + // Lookup each character in the table. For values > 255, this will end up truncating + // and thus we'll get skew in the data. It's already a gross approximation, though, + // and it is primarily meant for disambiguation of ASCII letters. + sum += s_frequency[(byte)c]; + } + return sum; + } + } + else if (s1.Chars is not null) + { + // If s1 has chars and s2 doesn't, then s1 has fewer chars. + return -1; + } + else if (s2.Chars is not null) + { + // If s2 has chars and s1 doesn't, then s2 has fewer chars. + return 1; + } + + return s1.Distance.CompareTo(s2.Distance); + }); + + return results; + + // Starting from the specified root node, populates results with any characters at a fixed distance + // from the node's starting position. The function returns true if the entire contents of the node + // is at a fixed distance, in which case distance will have been updated to include the full length + // of the node. If it returns false, the node isn't entirely fixed, in which case subsequent nodes + // shouldn't be examined and distance should no longer be trusted. However, regardless of whether it + // returns true or false, it may have populated results, and all populated results are valid. + static bool TryFindFixedSets(RegexNode node, List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)> results, ref int distance, CultureInfo culture, bool thorough) { - RegexNode alternateBranch = node.Child(branchNum); - caseInsensitive |= (alternateBranch.Options & RegexOptions.IgnoreCase) != 0; + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return false; + } - switch (alternateBranch.Type) + if ((node.Options & RegexOptions.RightToLeft) != 0) { + return false; + } + + bool caseInsensitive = (node.Options & RegexOptions.IgnoreCase) != 0; + + switch (node.Type) + { + case RegexNode.One: + if (results.Count < MaxFixedResults) + { + string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive); + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + return true; + } + return false; + + case RegexNode.Onelazy or RegexNode.Oneloop or RegexNode.Oneloopatomic when node.M > 0: + { + string setString = RegexCharClass.OneToStringClass(node.Ch, caseInsensitive ? culture : null, out bool resultIsCaseInsensitive); + int minIterations = Math.Min(node.M, MaxLoopExpansion); + int i = 0; + for (; i < minIterations && results.Count < MaxFixedResults; i++) + { + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + } + return i == node.M && i == node.N; + } + case RegexNode.Multi: - maxChars = Math.Min(maxChars, alternateBranch.Str!.Length); - for (int i = 0; i < maxChars; i++) { - (classes[i] ??= new RegexCharClass()).AddChar(alternateBranch.Str[i]); + string s = node.Str!; + int i = 0; + for (; i < s.Length && results.Count < MaxFixedResults; i++) + { + string setString = RegexCharClass.OneToStringClass(s[i], caseInsensitive ? 
culture : null, out bool resultIsCaseInsensitive); + results.Add((null, setString, distance++, resultIsCaseInsensitive)); + } + return i == s.Length; } - continue; + + case RegexNode.Set: + if (results.Count < MaxFixedResults) + { + results.Add((null, node.Str!, distance++, caseInsensitive)); + return true; + } + return false; + + case RegexNode.Setlazy or RegexNode.Setloop or RegexNode.Setloopatomic when node.M > 0: + { + int minIterations = Math.Min(node.M, MaxLoopExpansion); + int i = 0; + for (; i < minIterations && results.Count < MaxFixedResults; i++) + { + results.Add((null, node.Str!, distance++, caseInsensitive)); + } + return i == node.M && i == node.N; + } + + case RegexNode.Notone: + // We could create a set out of Notone, but it will be of little value in helping to improve + // the speed of finding the first place to match, as almost every character will match it. + distance++; + return true; + + case RegexNode.Notonelazy or RegexNode.Notoneloop or RegexNode.Notoneloopatomic when node.M == node.N: + distance += node.M; + return true; + + case RegexNode.Beginning: + case RegexNode.Bol: + case RegexNode.Boundary: + case RegexNode.ECMABoundary: + case RegexNode.Empty: + case RegexNode.End: + case RegexNode.EndZ: + case RegexNode.Eol: + case RegexNode.NonBoundary: + case RegexNode.NonECMABoundary: + case RegexNode.UpdateBumpalong: + case RegexNode.Start: + case RegexNode.Prevent: + case RegexNode.Require: + // Zero-width anchors and assertions. In theory for Prevent and Require we could also investigate + // them and use the learned knowledge to impact the generated sets, at least for lookaheads. + // For now, we don't bother. + return true; + + case RegexNode.Atomic: + case RegexNode.Group: + case RegexNode.Capture: + return TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough); + + case RegexNode.Lazyloop or RegexNode.Loop when node.M > 0: + // This effectively only iterates the loop once. 
If deemed valuable, + // it could be updated in the future to duplicate the found results + // (updated to incorporate distance from previous iterations) and + // summed distance for all node.M iterations. If node.M == node.N, + // this would then also allow continued evaluation of the rest of the + // expression after the loop. + TryFindFixedSets(node.Child(0), results, ref distance, culture, thorough); + return false; case RegexNode.Concatenate: { - int classPos = 0; - int concatChildren = alternateBranch.ChildCount(); - for (int i = 0; i < concatChildren && classPos < classes.Length; i++) + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) { - RegexNode concatChild = alternateBranch.Child(i); - caseInsensitive |= (concatChild.Options & RegexOptions.IgnoreCase) != 0; + if (!TryFindFixedSets(node.Child(i), results, ref distance, culture, thorough)) + { + return false; + } + } + return true; + } - switch (concatChild.Type) + case RegexNode.Alternate when thorough: + { + int childCount = node.ChildCount(); + bool allSameSize = true; + int? sameDistance = null; + var combined = new Dictionary(); + + var localResults = new List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>(); + for (int i = 0; i < childCount; i++) + { + localResults.Clear(); + int localDistance = 0; + allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, culture, thorough); + + if (localResults.Count == 0) { - case RegexNode.One: - (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Ch); - break; - case RegexNode.Set: - if (!(classes[classPos++] ??= new RegexCharClass()).TryAddCharClass(RegexCharClass.Parse(concatChild.Str!))) - { - // If the classes can't be merged, give up. 
- return null; - } - break; - case RegexNode.Multi: - for (int c = 0; c < concatChild.Str!.Length && classPos < classes.Length; c++) + return false; + } + + if (allSameSize) + { + if (sameDistance is null) + { + sameDistance = localDistance; + } + else if (sameDistance.Value != localDistance) + { + allSameSize = false; + } + } + + foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) fixedSet in localResults) + { + if (combined.TryGetValue(fixedSet.Distance, out (RegexCharClass Set, bool CaseInsensitive, int Count) value)) + { + if (fixedSet.CaseInsensitive == value.CaseInsensitive && + value.Set.TryAddCharClass(RegexCharClass.Parse(fixedSet.Set))) { - (classes[classPos++] ??= new RegexCharClass()).AddChar(concatChild.Str[c]); + value.Count++; + combined[fixedSet.Distance] = value; } - break; + } + else + { + combined[fixedSet.Distance] = (RegexCharClass.Parse(fixedSet.Set), fixedSet.CaseInsensitive, 1); + } + } + } + + foreach (KeyValuePair pair in combined) + { + if (results.Count >= MaxFixedResults) + { + allSameSize = false; + break; + } - default: // nothing else supported - i = concatChildren; // stop looking at additional nodes - break; + if (pair.Value.Count == childCount) + { + results.Add((null, pair.Value.Set.ToStringClass(), pair.Key + distance, pair.Value.CaseInsensitive)); } } - maxChars = Math.Min(maxChars, classPos); + if (allSameSize) + { + Debug.Assert(sameDistance.HasValue); + distance += sameDistance.Value; + return true; + } + + return false; } - continue; default: - // Any other node type as a branch in the alternation and we give up. Note that we don't special-case One/Notone/Set - // because that would mean the whole branch was a single char, in which case this computation provides - // zero benefit over the ComputeFirstCharClass computation. - return null; + return false; } } + } - // We've now examined all of the alternate branches and were able to successfully process them. 
- // Determine how many we can actually return. - for (int i = 0; i < maxChars; i++) - { - if (classes[i] is null) - { - maxChars = i; - break; - } - } + // Computes a character class for the first character in tree. This uses a more robust algorithm + // than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. For example, + // fixed literals won't find the starting set for a*b, as the a isn't guaranteed and the b is at a + // variable position, but this will find [ab] as it's instead looking for anything that under any + // circumstance could possibly start a match. + public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexTree tree, CultureInfo culture) + { + var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]); + RegexFC? fc = s.RegexFCFromRegexTree(tree); + s.Dispose(); - // Make sure we got something. - if (maxChars == 0) + if (fc == null || fc._nullable) { return null; } - // Create and return the RegexPrefix objects. - var prefixes = new (string CharClass, bool CaseInsensitive)[maxChars]; - - CultureInfo? ci = null; - if (caseInsensitive) - { - ci = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - } - - for (int i = 0; i < prefixes.Length; i++) + if (fc.CaseInsensitive) { - if (caseInsensitive) - { - classes[i]!.AddLowercase(ci!); - } - prefixes[i] = (classes[i]!.ToStringClass(), caseInsensitive); + fc.AddLowercase(culture); } - return prefixes; + return (fc.GetFirstChars(), fc.CaseInsensitive); } /// Takes a RegexTree and computes the leading anchor that it encounters. @@ -619,6 +883,84 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex) throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture))); } } + + /// Percent occurrences in source text (100 * char count / total count). 
+ private static readonly float[] s_frequency = new float[] + { + 0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */, + 0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */, + 0.000f /* '\x10' */, 0.000f /* '\x11' */, 0.000f /* '\x12' */, 0.000f /* '\x13' */, 0.003f /* '\x14' */, 0.000f /* '\x15' */, 0.000f /* '\x16' */, 0.000f /* '\x17' */, + 0.000f /* '\x18' */, 0.004f /* '\x19' */, 0.000f /* '\x1A' */, 0.000f /* '\x1B' */, 0.006f /* '\x1C' */, 0.006f /* '\x1D' */, 0.000f /* '\x1E' */, 0.000f /* '\x1F' */, + 8.952f /* ' ' */, 0.065f /* ' !' */, 0.420f /* ' "' */, 0.010f /* ' #' */, 0.011f /* ' $' */, 0.005f /* ' %' */, 0.070f /* ' &' */, 0.050f /* ' '' */, + 3.911f /* ' (' */, 3.910f /* ' )' */, 0.356f /* ' *' */, 2.775f /* ' +' */, 1.411f /* ' ,' */, 0.173f /* ' -' */, 2.054f /* ' .' */, 0.677f /* ' /' */, + 1.199f /* ' 0' */, 0.870f /* ' 1' */, 0.729f /* ' 2' */, 0.491f /* ' 3' */, 0.335f /* ' 4' */, 0.269f /* ' 5' */, 0.435f /* ' 6' */, 0.240f /* ' 7' */, + 0.234f /* ' 8' */, 0.196f /* ' 9' */, 0.144f /* ' :' */, 0.983f /* ' ;' */, 0.357f /* ' <' */, 0.661f /* ' =' */, 0.371f /* ' >' */, 0.088f /* ' ?' 
*/, + 0.007f /* ' @' */, 0.763f /* ' A' */, 0.229f /* ' B' */, 0.551f /* ' C' */, 0.306f /* ' D' */, 0.449f /* ' E' */, 0.337f /* ' F' */, 0.162f /* ' G' */, + 0.131f /* ' H' */, 0.489f /* ' I' */, 0.031f /* ' J' */, 0.035f /* ' K' */, 0.301f /* ' L' */, 0.205f /* ' M' */, 0.253f /* ' N' */, 0.228f /* ' O' */, + 0.288f /* ' P' */, 0.034f /* ' Q' */, 0.380f /* ' R' */, 0.730f /* ' S' */, 0.675f /* ' T' */, 0.265f /* ' U' */, 0.309f /* ' V' */, 0.137f /* ' W' */, + 0.084f /* ' X' */, 0.023f /* ' Y' */, 0.023f /* ' Z' */, 0.591f /* ' [' */, 0.085f /* ' \' */, 0.590f /* ' ]' */, 0.013f /* ' ^' */, 0.797f /* ' _' */, + 0.001f /* ' `' */, 4.596f /* ' a' */, 1.296f /* ' b' */, 2.081f /* ' c' */, 2.005f /* ' d' */, 6.903f /* ' e' */, 1.494f /* ' f' */, 1.019f /* ' g' */, + 1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */, + 1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */, + 0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */, + 0.000f /* '\x80' */, 0.000f /* '\x81' */, 0.000f /* '\x82' */, 0.000f /* '\x83' */, 0.000f /* '\x84' */, 0.000f /* '\x85' */, 0.000f /* '\x86' */, 0.000f /* '\x87' */, + 0.000f /* '\x88' */, 0.000f /* '\x89' */, 0.000f /* '\x8A' */, 0.000f /* '\x8B' */, 0.000f /* '\x8C' */, 0.000f /* '\x8D' */, 0.000f /* '\x8E' */, 0.000f /* '\x8F' */, + 0.000f /* '\x90' */, 0.000f /* '\x91' */, 0.000f /* '\x92' */, 0.000f /* '\x93' */, 0.000f /* '\x94' */, 0.000f /* '\x95' */, 0.000f /* '\x96' */, 0.000f /* '\x97' */, + 0.000f /* '\x98' */, 0.000f /* '\x99' */, 0.000f /* '\x9A' */, 0.000f /* '\x9B' */, 0.000f /* '\x9C' */, 0.000f /* '\x9D' */, 0.000f /* '\x9E' */, 0.000f /* '\x9F' */, + 0.000f /* '\xA0' */, 0.000f /* '\xA1' */, 0.000f /* '\xA2' */, 0.000f /* '\xA3' 
*/, 0.000f /* '\xA4' */, 0.000f /* '\xA5' */, 0.000f /* '\xA6' */, 0.000f /* '\xA7' */, + 0.000f /* '\xA8' */, 0.000f /* '\xA9' */, 0.000f /* '\xAA' */, 0.000f /* '\xAB' */, 0.000f /* '\xAC' */, 0.000f /* '\xAD' */, 0.000f /* '\xAE' */, 0.000f /* '\xAF' */, + 0.000f /* '\xB0' */, 0.000f /* '\xB1' */, 0.000f /* '\xB2' */, 0.000f /* '\xB3' */, 0.000f /* '\xB4' */, 0.000f /* '\xB5' */, 0.000f /* '\xB6' */, 0.000f /* '\xB7' */, + 0.000f /* '\xB8' */, 0.000f /* '\xB9' */, 0.000f /* '\xBA' */, 0.000f /* '\xBB' */, 0.000f /* '\xBC' */, 0.000f /* '\xBD' */, 0.000f /* '\xBE' */, 0.000f /* '\xBF' */, + 0.000f /* '\xC0' */, 0.000f /* '\xC1' */, 0.000f /* '\xC2' */, 0.000f /* '\xC3' */, 0.000f /* '\xC4' */, 0.000f /* '\xC5' */, 0.000f /* '\xC6' */, 0.000f /* '\xC7' */, + 0.000f /* '\xC8' */, 0.000f /* '\xC9' */, 0.000f /* '\xCA' */, 0.000f /* '\xCB' */, 0.000f /* '\xCC' */, 0.000f /* '\xCD' */, 0.000f /* '\xCE' */, 0.000f /* '\xCF' */, + 0.000f /* '\xD0' */, 0.000f /* '\xD1' */, 0.000f /* '\xD2' */, 0.000f /* '\xD3' */, 0.000f /* '\xD4' */, 0.000f /* '\xD5' */, 0.000f /* '\xD6' */, 0.000f /* '\xD7' */, + 0.000f /* '\xD8' */, 0.000f /* '\xD9' */, 0.000f /* '\xDA' */, 0.000f /* '\xDB' */, 0.000f /* '\xDC' */, 0.000f /* '\xDD' */, 0.000f /* '\xDE' */, 0.000f /* '\xDF' */, + 0.000f /* '\xE0' */, 0.000f /* '\xE1' */, 0.000f /* '\xE2' */, 0.000f /* '\xE3' */, 0.000f /* '\xE4' */, 0.000f /* '\xE5' */, 0.000f /* '\xE6' */, 0.000f /* '\xE7' */, + 0.000f /* '\xE8' */, 0.000f /* '\xE9' */, 0.000f /* '\xEA' */, 0.000f /* '\xEB' */, 0.000f /* '\xEC' */, 0.000f /* '\xED' */, 0.000f /* '\xEE' */, 0.000f /* '\xEF' */, + 0.000f /* '\xF0' */, 0.000f /* '\xF1' */, 0.000f /* '\xF2' */, 0.000f /* '\xF3' */, 0.000f /* '\xF4' */, 0.000f /* '\xF5' */, 0.000f /* '\xF6' */, 0.000f /* '\xF7' */, + 0.000f /* '\xF8' */, 0.000f /* '\xF9' */, 0.000f /* '\xFA' */, 0.000f /* '\xFB' */, 0.000f /* '\xFC' */, 0.000f /* '\xFD' */, 0.000f /* '\xFE' */, 0.000f /* '\xFF' */, + }; + + // The above table was generated 
programmatically with the following. This can be augmented to incorporate additional data sources, + // though it is only intended to be a rough approximation for use when tie-breaking and we'd otherwise be picking randomly, so, it's something. + // The frequencies may be wildly inaccurate when used with data sources different in nature than the training set, in which case we shouldn't + // be much worse off than just picking randomly: + // + // using System.Runtime.InteropServices; + // + // var counts = new Dictionary<byte, long>(); + // + // (string, string)[] rootsAndExtensions = new[] + // { + // (@"d:\repos\runtime\src\", "*.cs"), // C# files in dotnet/runtime + // (@"d:\Top25GutenbergBooks", "*.txt"), // Top 25 most popular books on Project Gutenberg + // }; + // + // foreach ((string root, string ext) in rootsAndExtensions) + // foreach (string path in Directory.EnumerateFiles(root, ext, SearchOption.AllDirectories)) + // foreach (string line in File.ReadLines(path)) + // foreach (char c in line.AsSpan().Trim()) + // CollectionsMarshal.GetValueRefOrAddDefault(counts, (byte)c, out _)++; + // + // long total = counts.Sum(i => i.Value); + // + // Console.WriteLine("/// Percent occurrences in source text (100 * char count / total count)."); + // Console.WriteLine("private static readonly float[] s_frequency = new float[]"); + // Console.WriteLine("{"); + // int i = 0; + // for (int row = 0; row < 32; row++) + // { + // Console.Write(" "); + // for (int col = 0; col < 8; col++) + // { + // counts.TryGetValue((byte)i, out long charCount); + // float frequency = (float)(charCount / (double)total) * 100; + // Console.Write($" {frequency:N3}f /* '{(i >= 32 && i < 127 ? 
$" {(char)i}" : $"\\x{i:X2}")}' */,"); + // i++; + // } + // Console.WriteLine(); + // } + // Console.WriteLine("};"); } internal sealed class RegexFC diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 2154947cfaa8d..93420b2381381 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -4,6 +4,7 @@ using System.Collections; using System.Collections.Generic; using System.Globalization; +using System.Runtime.InteropServices; namespace System.Text.RegularExpressions { @@ -38,10 +39,10 @@ private RegexWriter(Span emittedSpan, Span intStackSpan) /// This is the only function that should be called from outside. /// It takes a RegexTree and creates a corresponding RegexCode. /// - public static RegexCode Write(RegexTree tree) + public static RegexCode Write(RegexTree tree, CultureInfo culture) { var writer = new RegexWriter(stackalloc int[EmittedSize], stackalloc int[IntStackSize]); - RegexCode code = writer.RegexCodeFromRegexTree(tree); + RegexCode code = writer.RegexCodeFromRegexTree(tree, culture); writer.Dispose(); #if DEBUG @@ -71,7 +72,7 @@ public void Dispose() /// It also computes various information about the tree, such as /// prefix data to help with optimizations. /// - public RegexCode RegexCodeFromRegexTree(RegexTree tree) + public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture) { // Construct sparse capnum mapping if some numbers are unused. 
int capsize; @@ -131,46 +132,6 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) Emit(RegexCode.Stop); int[] emitted = _emitted.AsSpan().ToArray(); - bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0; - bool compiled = (tree.Options & RegexOptions.Compiled) != 0; - - // Compute prefixes to help optimize FindFirstChar. - RegexBoyerMoore? boyerMoorePrefix = null; - (string CharClass, bool CaseInsensitive)[]? leadingCharClasses = null; - (string leadingSubstring, bool leadingSubstringCI) = RegexPrefixAnalyzer.ComputeLeadingSubstring(tree); - if (leadingSubstring.Length > 1 && // if it's <= 1, perf is better using leadingCharClasses - leadingSubstring.Length <= RegexBoyerMoore.MaxLimit) - { - // Compute a Boyer-Moore prefix if we find a single string of sufficient length that always begins the expression. - CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - boyerMoorePrefix = new RegexBoyerMoore(leadingSubstring, leadingSubstringCI, rtl, culture); - } - - // If we didn't find a single leading substring, or if we found one but we won't be able to use it for a Boyer-Moore - // search, try to compute the characters set that might begin the string. - if (boyerMoorePrefix is null || - (boyerMoorePrefix.NegativeUnicode != null && compiled)) // compilation won't use Boyer-Moore if it has a negative Unicode table - { - boyerMoorePrefix = null; - - // First we employ a less aggressive but more valuable computation to see if we can find sets for each of the first N - // characters in the string. If that's unsuccessful, we employ a more aggressive check to compute a set for just - // the first character in the string. 
- - if ((tree.Options & RegexOptions.Compiled) != 0) // currently not utilized by the interpreter - { - leadingCharClasses = RegexPrefixAnalyzer.ComputeMultipleCharClasses(tree, maxChars: 5); // limit of 5 is based on experimentation and can be tweaked as needed - } - - if (leadingCharClasses is null) - { - leadingCharClasses = RegexPrefixAnalyzer.ComputeFirstCharClass(tree); - } - } - - // Compute any anchors starting the expression. - int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); - // Convert the string table into an ordered string array. var strings = new string[_stringTable.Count]; foreach (KeyValuePair stringEntry in _stringTable) @@ -179,7 +140,7 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) } // Return all that in a RegexCode object. - return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl); + return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize); } /// @@ -233,16 +194,23 @@ private void Emit(int op, int opd1, int opd2) /// /// Returns an index in the string table for a string; - /// uses a hashtable to eliminate duplicates. + /// uses a dictionary to eliminate duplicates. 
/// private int StringCode(string str) { +#if REGEXGENERATOR if (!_stringTable.TryGetValue(str, out int i)) { i = _stringTable.Count; _stringTable.Add(str, i); } - +#else + ref int i = ref CollectionsMarshal.GetValueRefOrAddDefault(_stringTable, str, out bool exists); + if (!exists) + { + i = _stringTable.Count - 1; + } +#endif return i; } @@ -265,7 +233,7 @@ private int MapCapnum(int capnum) => private void EmitFragment(int nodetype, RegexNode node, int curIndex) { int bits = 0; - if (node.UseOptionR()) + if ((node.Options & RegexOptions.RightToLeft) != 0) { bits |= RegexCode.Rtl; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs index a5f79f27ceaac..5ffe1da8ce163 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs @@ -240,6 +240,7 @@ public BDD[] TopologicalSort() /// Serializer uses more compacted representations when fewer bits are needed, which is reflected in the first /// two numbers of the return value. MTBDD terminals are represented by negated numbers as -id. 
/// + [ExcludeFromCodeCoverage] public long[] Serialize() { if (IsEmpty) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index bf75d21a05fcf..1fec095c54b10 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.Diagnostics; -using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions.Symbolic @@ -406,7 +405,6 @@ private DfaMatchingState MakeNewState(DfaMatchingState state lock (this) { state.Id = _stateCache.Count; - int k = state.GetHashCode(); _stateCache.Add(state); Debug.Assert(_statearray is not null); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index 3bd803e47f338..ba522d513392c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -19,28 +19,6 @@ internal readonly struct SymbolicRegexInfo private SymbolicRegexInfo(uint i) => _info = i; - /// Optimized lookup array for most common combinations. 
- /// Most common cases will be 0 (no anchors and not nullable) and 1 (no anchors and nullable) - private static readonly SymbolicRegexInfo[] s_infos = CreateSymbolicRegexInfos(); - - private static SymbolicRegexInfo[] CreateSymbolicRegexInfos() - { - var infos = new SymbolicRegexInfo[128]; - for (uint i = 0; i < infos.Length; i++) - { - infos[i] = new SymbolicRegexInfo(i); - } - return infos; - } - - private static SymbolicRegexInfo Mk(uint i) - { - SymbolicRegexInfo[] infos = s_infos; - return i < infos.Length ? - infos[i] : - new SymbolicRegexInfo(i); - } - internal static SymbolicRegexInfo Mk(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false, bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true) @@ -87,7 +65,7 @@ internal static SymbolicRegexInfo Mk(bool isAlwaysNullable = false, bool canBeNu i |= IsLazyMask; } - return Mk(i); + return new SymbolicRegexInfo(i); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -121,7 +99,7 @@ public static SymbolicRegexInfo Or(SymbolicRegexInfo[] infos) } i = (i & ~IsLazyMask) | isLazy; - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) @@ -140,7 +118,7 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) i = (i & ~IsLazyMask) | isLazy; i = (i & ~(IsAlwaysNullableMask | CanBeNullableMask)) | isNullable; - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) @@ -164,7 +142,10 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound uint i = body_info._info; // The loop is nullable if either the body is nullable or if the lower boud is 0 - i |= lowerBound == 0 ? 
(IsAlwaysNullableMask | CanBeNullableMask) : 0; + if (lowerBound == 0) + { + i |= IsAlwaysNullableMask | CanBeNullableMask; + } // The loop is lazy iff it is marked lazy if (isLazy) @@ -176,7 +157,7 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound i &= ~IsLazyMask; } - return Mk(i); + return new SymbolicRegexInfo(i); } public static SymbolicRegexInfo Not(SymbolicRegexInfo info) => diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index a8cec12036225..fb8bd13e4bec8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -11,15 +11,8 @@ namespace System.Text.RegularExpressions.Symbolic { /// Represents a regex matching engine that performs regex matching using symbolic derivatives. - internal abstract class SymbolicRegexMatcher + internal interface ISymbolicRegexMatcher { - /// Returns the next match index and length in the input string. - /// Whether to return once we know there's a match without determining where exactly it matched. - /// The input string. - /// The start position in the input. - /// The end position in the input. - public abstract SymbolicMatch FindMatch(bool isMatch, string input, int startat, int endat); - #if DEBUG /// Unwind the regex of the matcher and save the resulting state graph in DGML /// roughly the maximum number of states, 0 means no bound @@ -30,8 +23,7 @@ internal abstract class SymbolicRegexMatcher /// dgml output is written here /// maximum length of labels in nodes anything over that length is indicated with .. 
/// if true creates NFA instead of DFA - public abstract void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA); - + void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA); /// /// Generates up to k random strings matched by the regex @@ -40,13 +32,13 @@ internal abstract class SymbolicRegexMatcher /// random seed for the generator, 0 means no random seed /// if true then generate inputs that do not match /// - public abstract IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative); + IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative); #endif } /// Represents a regex matching engine that performs regex matching using symbolic derivatives. /// Character set type. - internal sealed class SymbolicRegexMatcher : SymbolicRegexMatcher where TSetType : notnull + internal sealed class SymbolicRegexMatcher : ISymbolicRegexMatcher where TSetType : notnull { /// Maximum number of states before switching over to Antimirov mode. /// @@ -127,54 +119,24 @@ internal sealed class SymbolicRegexMatcher : SymbolicRegexMatcher wher /// Timeout in milliseconds. This is only used if is true. private readonly int _timeout; - /// Classifier used to say whether a particular character can start a match for . - internal readonly BooleanClassifier _startSetClassifier; - - /// Predicate over characters that make some progress - private readonly TSetType _startSet; - - /// Maximum allowed size of . - private const int StartSetArrayMaxSize = 5; - - /// String of at most many characters - private readonly char[] _startSetArray; - - /// Number of elements in - private readonly int _startSetSize; - - /// If nonempty then has that fixed prefix - private readonly string _prefix; + /// Data and routines for skipping ahead to the next place a match could potentially start. 
+ private readonly RegexFindOptimizations? _findOpts; - /// Non-null when is nonempty - private readonly RegexBoyerMoore? _prefixBoyerMoore; + /// The initial states for the original pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _initialStates; - /// If true then the fixed prefix of is idependent of case - private readonly bool _isPrefixCaseInsensitive; + /// The initial states for the dot-star pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _dotstarredInitialStates; - /// Cached skip states from the initial state of for the 5 possible previous character kinds. - private readonly DfaMatchingState?[] _prefixSkipStates = new DfaMatchingState[CharKind.CharKindCount]; - /// Cached skip states from the initial state of Ar for the 5 possible previous character kinds. - private readonly DfaMatchingState?[] _reversePrefixSkipStates = new DfaMatchingState[CharKind.CharKindCount]; + /// The initial states for the reverse pattern, keyed off of the previous character kind. + /// If the pattern doesn't contain any anchors, there will only be a single initial state. + private readonly DfaMatchingState[] _reverseInitialStates; - private readonly string _reversePrefix; - - private readonly DfaMatchingState[] _initialStates = new DfaMatchingState[CharKind.CharKindCount]; - private readonly DfaMatchingState[] _dotstarredInitialStates = new DfaMatchingState[CharKind.CharKindCount]; - private readonly DfaMatchingState[] _reverseInitialStates = new DfaMatchingState[CharKind.CharKindCount]; - - private readonly uint[] _asciiCharKinds = new uint[128]; - - internal readonly CultureInfo _culture; - - private DfaMatchingState GetSkipState(uint prevCharKind) => - Volatile.Read(ref _prefixSkipStates[prevCharKind]) ?? 
- Interlocked.CompareExchange(ref _prefixSkipStates[prevCharKind], DeltaPlus(_prefix, _dotstarredInitialStates[prevCharKind]), null) ?? - _prefixSkipStates[prevCharKind]!; - - private DfaMatchingState GetReverseSkipState(uint prevCharKind) => - Volatile.Read(ref _reversePrefixSkipStates[prevCharKind]) ?? - Interlocked.CompareExchange(ref _reversePrefixSkipStates[prevCharKind], DeltaPlus(_reversePrefix, _reverseInitialStates[prevCharKind]), null) ?? - _reversePrefixSkipStates[prevCharKind]!; + /// Lookup table to quickly determine the character kind for ASCII characters. + /// Non-null iff the pattern contains anchors; otherwise, it's unused. + private readonly uint[]? _asciiCharKinds; /// Get the minterm of . /// character code @@ -186,16 +148,14 @@ private TSetType GetMinterm(int c) } /// Constructs matcher for given symbolic regex. - internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture) + internal SymbolicRegexMatcher(SymbolicRegexNode sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture) { + Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}"); + _pattern = sr; _builder = sr._builder; - _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout; _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms - _culture = culture; - - Debug.Assert(_builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {_builder._solver}"); _partitions = _builder._solver switch { BV64Algebra bv64 => bv64._classifier, @@ -203,44 +163,57 @@ internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, _ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms), }; - _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern); - _reversePattern = _pattern.Reverse(); - ConfigureRegexes(); - - 
_startSet = _pattern.GetStartSet(); - if (!_builder._solver.IsSatisfiable(_startSet) || _pattern.CanBeNullable) + if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch && + code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match. { - // If the startset is empty make it full instead by including all characters - // this is to ensure that startset is nonempty -- as an invariant assumed by operations using it - // - // Also, if A can be nullable then effectively disable use of startset by making it true - // because it may force search of next character in startset and fail to recognize an empty match - // because (by definition) an empty match has no start character. - // - // For example (this is also a unit test): - // for pattern "\B\W*?" or "\B\W*" or "\B\W?" and input "e.g:abc" there is an empty match in position 5 - // but startset \W will force search beyond position 5 and fails to find that match - _startSet = _builder._solver.True; + _findOpts = code.FindOptimizations; } - _startSetSize = (int)_builder._solver.ComputeDomainSize(_startSet); + // Determine the number of initial states. If there's no anchor, only the default previous + // character kind 0 is ever going to be used for all initial states. + int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1; - BDD startbdd = _builder._solver.ConvertToCharSet(css, _startSet); - _startSetClassifier = new BooleanClassifier(css, startbdd); - - //store the start characters in the A_startset_array if there are not too many characters - _startSetArray = _startSetSize <= StartSetArrayMaxSize ? - new List(css.GenerateAllCharacters(startbdd)).ToArray() : - Array.Empty(); + // Create the initial states for the original pattern. 
+ var initialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < initialStates.Length; i++) + { + initialStates[i] = _builder.MkState(_pattern, i); + } + _initialStates = initialStates; - _prefix = _pattern.GetFixedPrefix(css, culture.Name, out _isPrefixCaseInsensitive); - _reversePrefix = _reversePattern.GetFixedPrefix(css, culture.Name, out _); + // Create the dot-star pattern (a concatenation of any* with the original pattern) + // and all of its initial states. + _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern); + var dotstarredInitialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < dotstarredInitialStates.Length; i++) + { + // Used to detect if initial state was reentered, + // but observe that the behavior from the state may ultimately depend on the previous + // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, + // in that sense there can be several "versions" (not more than StateCount) of the initial state. + DfaMatchingState state = _builder.MkState(_dotStarredPattern, i); + state.IsInitialState = true; + dotstarredInitialStates[i] = state; + } + _dotstarredInitialStates = dotstarredInitialStates; - _prefixBoyerMoore = InitializePrefixBoyerMoore(); + // Create the reverse pattern (the original pattern in reverse order) and all of its + // initial states. + _reversePattern = _pattern.Reverse(); + var reverseInitialStates = new DfaMatchingState[statesCount]; + for (uint i = 0; i < reverseInitialStates.Length; i++) + { + reverseInitialStates[i] = _builder.MkState(_reversePattern, i); + } + _reverseInitialStates = reverseInitialStates; + // Initialize our fast-lookup for determining the character kind of ASCII characters. + // This is only required when the pattern contains anchors, as otherwise there's only + // ever a single kind used. 
if (_pattern._info.ContainsSomeAnchor) { - for (int i = 0; i < 128; i++) + var asciiCharKinds = new uint[128]; + for (int i = 0; i < asciiCharKinds.Length; i++) { TSetType predicate2; uint charKind; @@ -256,68 +229,12 @@ internal SymbolicRegexMatcher(SymbolicRegexNode sr, CharSetSolver css, charKind = CharKind.WordLetter; } - _asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind; + asciiCharKinds[i] = _builder._solver.And(GetMinterm(i), predicate2).Equals(_builder._solver.False) ? 0 : charKind; } + _asciiCharKinds = asciiCharKinds; } } - private RegexBoyerMoore? InitializePrefixBoyerMoore() - { - if (_prefix != string.Empty && _prefix.Length <= RegexBoyerMoore.MaxLimit && _prefix.Length > 1) - { - // RegexBoyerMoore expects the prefix to be lower case when case is ignored. - // Use the culture of the matcher. - string prefix = _isPrefixCaseInsensitive ? _prefix.ToLower(_culture) : _prefix; - return new RegexBoyerMoore(prefix, _isPrefixCaseInsensitive, rightToLeft: false, _culture); - } - - return null; - } - - private void ConfigureRegexes() - { - void Configure(uint i) - { - _initialStates[i] = _builder.MkState(_pattern, i); - - // Used to detect if initial state was reentered, then startset can be triggered - // but observe that the behavior from the state may ultimately depend on the previous - // input char e.g. possibly causing nullability of \b or \B or of a start-of-line anchor, - // in that sense there can be several "versions" (not more than StateCount) of the initial state. - _dotstarredInitialStates[i] = _builder.MkState(_dotStarredPattern, i); - _dotstarredInitialStates[i].IsInitialState = true; - - _reverseInitialStates[i] = _builder.MkState(_reversePattern, i); - } - - // Create initial states for A, A1 and Ar. - if (!_pattern._info.ContainsSomeAnchor) - { - // Only the default previous character kind 0 is ever going to be used for all initial states. 
- // _A1q0[0] is recognized as special initial state. - // This information is used for search optimization based on start set and prefix of A. - Configure(0); - } - else - { - for (uint i = 0; i < CharKind.CharKindCount; i++) - { - Configure(i); - } - } - } - - /// Return the state after the given string from the given state . - private DfaMatchingState DeltaPlus(string pattern, DfaMatchingState state) where TTransition : struct, ITransition - { - for (int i = 0; i < pattern.Length; i++) - { - state = Delta(pattern, i, state); - } - - return state; - } - /// Interface for transitions used by the method. private interface ITransition { @@ -428,30 +345,21 @@ private DfaMatchingState CreateNewTransition(DfaMatchingState timeoutOccursAt && 0 < currentMillis) - return; - - //regex pattern is in general not available in srm and - //the input is not available here but could be passed as argument to DoCheckTimeout - throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout)); + if (currentMillis >= timeoutOccursAt && (0 <= timeoutOccursAt || 0 >= currentMillis)) + { + throw new RegexMatchTimeoutException(string.Empty, string.Empty, TimeSpan.FromMilliseconds(_timeout)); + } } /// Find a match. /// Whether to return once we know there's a match without determining where exactly it matched. - /// input string - /// the position to start search in the input string - /// the next position after the end position in the input - public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, int k) + /// The input string + /// The position to start search in the input string. + /// The non-inclusive position to end the search in the input string. 
+ public SymbolicMatch FindMatch(bool isMatch, string input, int startat, int end) { int timeoutOccursAt = 0; if (_checkTimeout) @@ -460,18 +368,16 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5); } - if (startat == k) + if (startat == end) { - //covers the special case when the remaining input suffix - //where a match is sought is empty (for example when the input is empty) - //in this case the only possible match is an empty match + // Covers the special-case of an empty match at the end of the input. uint prevKind = GetCharKind(input, startat - 1); uint nextKind = GetCharKind(input, startat); bool emptyMatchExists = _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind)); - return - !emptyMatchExists ? SymbolicMatch.NoMatch : - new SymbolicMatch(startat, 0); + return emptyMatchExists ? + new SymbolicMatch(startat, 0) : + SymbolicMatch.NoMatch; } // Find the first accepting state. Initial start position in the input is i == 0. @@ -479,7 +385,7 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, // May return -1 as a legitimate value when the initial state is nullable and startat == 0. // Returns NoMatchExists when there is no match. - i = FindFinalStatePosition(input, k, i, timeoutOccursAt, out int i_q0_A1, out int watchdog); + i = FindFinalStatePosition(input, end, i, timeoutOccursAt, out int i_q0_A1, out int watchdog); if (i == NoMatchExists) { @@ -502,24 +408,17 @@ public override SymbolicMatch FindMatch(bool isMatch, string input, int startat, } else { - if (i < startat) - { - Debug.Assert(i == startat - 1); - i_start = startat; - } - else - { - // Walk in reverse to locate the start position of the match - i_start = FindStartPosition(input, i, i_q0_A1); - } - - i_end = FindEndPosition(input, k, i_start); + Debug.Assert(i >= startat - 1); + i_start = i < startat ? 
+ startat : + FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match + i_end = FindEndPosition(input, end, i_start); } return new SymbolicMatch(i_start, i_end + 1 - i_start); } - /// Find match end position using A, end position is known to exist. + /// Find match end position using the original pattern, end position is known to exist. /// input array /// inclusive start position /// exclusive end position @@ -561,7 +460,7 @@ private int FindEndPosition(string input, int exclusiveEnd, int i) return i_end; } - // Inner loop for FindEndPosition parameterized by an ITransition type. + /// Inner loop for FindEndPosition parameterized by an ITransition type. [MethodImpl(MethodImplOptions.AggressiveInlining)] private bool FindEndPositionDeltas(string input, ref int i, int j, ref DfaMatchingState q, ref int i_end) where TTransition : struct, ITransition { @@ -582,7 +481,7 @@ private bool FindEndPositionDeltas(string input, ref int i, int j, } else if (q.IsDeadend) { - // Nonaccepting sink state (deadend) has been reached in A. + // Non-accepting sink state (deadend) has been reached in the original pattern. // So the match ended when the last i_end was updated. return true; } @@ -594,26 +493,18 @@ private bool FindEndPositionDeltas(string input, ref int i, int j, return false; } - /// Walk back in reverse using Ar to find the start position of match, start position is known to exist. + /// Walk back in reverse using the reverse pattern to find the start position of match, start position is known to exist. /// the input string /// position to start walking back from, i points at the last character of the match /// do not pass this boundary when walking back /// private int FindStartPosition(string input, int i, int match_start_boundary) { - // Fetch the correct start state for Ar. + // Fetch the correct start state for the reverse pattern. 
// This depends on previous character --- which, because going backwards, is character number i+1. uint prevKind = GetCharKind(input, i + 1); DfaMatchingState q = _reverseInitialStates[prevKind]; - // Ar may have a fixed prefix sequence - if (_reversePrefix.Length > 0) - { - //skip past the prefix portion of Ar - q = GetReverseSkipState(prevKind); - i -= _reversePrefix.Length; - } - if (i == -1) { Debug.Assert(q.IsNullable(GetCharKind(input, i)), "we reached the beginning of the input, thus the state q must be accepting"); @@ -623,12 +514,12 @@ private int FindStartPosition(string input, int i, int match_start_boundary) int last_start = -1; if (q.IsNullable(GetCharKind(input, i))) { - // The whole prefix of Ar was in reverse a prefix of A, - // for example when the pattern of A is concrete word such as "abc" + // The whole prefix of the reverse pattern was in reverse a prefix of the original pattern, + // for example when the original pattern is concrete word such as "abc" last_start = i + 1; } - //walk back to the accepting state of Ar + // Walk back to the accepting state of the reverse pattern while (i >= match_start_boundary) { int j = Math.Max(match_start_boundary, i - AntimirovThresholdLeeway); @@ -663,7 +554,7 @@ private bool FindStartPositionDeltas(string input, ref int i, int j if (q.IsNullable(GetCharKind(input, i - 1))) { // Earliest start point so far. This must happen at some point - // or else A1 would not have reached a final state after match_start_boundary. + // or else the dot-star pattern would not have reached a final state after match_start_boundary. last_start = i; } @@ -683,7 +574,7 @@ private bool FindStartPositionDeltas(string input, ref int i, int j /// length of match when positive private int FindFinalStatePosition(string input, int k, int i, int timeoutOccursAt, out int initialStateIndex, out int watchdog) { - // Get the correct start state of A1, which in general depends on the previous character kind in the input. 
+ // Get the correct start state of the dot-star pattern, which in general depends on the previous character kind in the input. uint prevCharKindId = GetCharKind(input, i - 1); DfaMatchingState q = _dotstarredInitialStates[prevCharKindId]; initialStateIndex = i; @@ -712,53 +603,13 @@ private int FindFinalStatePosition(string input, int k, int i, int timeoutOccurs { if (q.IsInitialState) { - // i_q0_A1 is the most recent position in the input when A1 is in the initial state + // i_q0_A1 is the most recent position in the input when the dot-star pattern is in the initial state initialStateIndex = i; - if (_prefixBoyerMoore != null) + if (_findOpts is RegexFindOptimizations findOpts) { - // Stay in the initial state if the prefix does not match. - // Thus advance the current position to the first position where the prefix does match. - i = _prefixBoyerMoore.Scan(input, i, 0, input.Length); - - if (i == -1) // Scan returns -1 when a matching position does not exist - { - watchdog = -1; - return -2; - } - - // Compute the end state for the A prefix. - // Skip directly to the resulting state - // --- i.e. do the loop --- - // for (int j = 0; j < prefix.Length; j++) - // q = Delta(prefix[j], q, out regex); - // --- - q = GetSkipState(q.PrevCharKind); - - // skip the prefix - i += _prefix.Length; - - // here i points at the next character (the character immediately following the prefix) - if (q.IsNullable(GetCharKind(input, i))) - { - // Return the last position of the match - watchdog = q.WatchDog; - return i - 1; - } - - if (i == k) - { - // no match was found - return -2; - } - } - else - { - // we are still in the initial state, when the prefix is empty - // find the first position i that matches with some character in the start set - i = IndexOfStartSet(input, i); - - if (i == -1) + // Find the first position i that matches with some likely character. 
+ if (!findOpts.TryFindNextStartingPosition(input, ref i, 0, 0, k)) { // no match was found return NoMatchExists; @@ -833,68 +684,45 @@ private bool FindFinalStatePositionDeltas(string input, int j, ref [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetCharKind(string input, int i) { - if (!_pattern._info.ContainsSomeAnchor) - { - // The previous character kind is irrelevant when anchors are not used. - return CharKind.General; - } - - if (i == -1 || i == input.Length) - { - return CharKind.StartStop; - } + return !_pattern._info.ContainsSomeAnchor ? + CharKind.General : // The previous character kind is irrelevant when anchors are not used. + GetCharKindWithAnchor(input, i); - char nextChar = input[i]; - if (nextChar == '\n') + uint GetCharKindWithAnchor(string input, int i) { - return - _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n - i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). - CharKind.Newline; - } - - uint[] asciiCharKinds = _asciiCharKinds; - return - nextChar < asciiCharKinds.Length ? asciiCharKinds[nextChar] : - _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 0 : //apply the wordletter predicate to compute the kind of the next character - CharKind.WordLetter; - } + Debug.Assert(_asciiCharKinds is not null); - /// - /// Find first occurrence of startset element in input starting from index i. - /// Startset here is assumed to consist of a few characters. 
- /// - /// input string to search in - /// the start index in input to search from - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private int IndexOfStartSet(string input, int i) - { - if (_startSetSize <= StartSetArrayMaxSize) - { - return input.IndexOfAny(_startSetArray, i); - } + if ((uint)i >= input.Length) + { + return CharKind.StartStop; + } - for (int j = i; j < input.Length; j++) - { - if (_startSetClassifier.IsTrue(input[j])) + char nextChar = input[i]; + if (nextChar == '\n') { - return j; + return + _builder._newLinePredicate.Equals(_builder._solver.False) ? 0 : // ignore \n + i == 0 || i == input.Length - 1 ? CharKind.NewLineS : // very first or very last \n. Detection of very first \n is needed for rev(\Z). + CharKind.Newline; } - } - return -1; + uint[] asciiCharKinds = _asciiCharKinds; + return + nextChar < asciiCharKinds.Length ? asciiCharKinds[nextChar] : + _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 
0 : //apply the wordletter predicate to compute the kind of the next character + CharKind.WordLetter; + } } #if DEBUG - public override void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) + public void SaveDGML(TextWriter writer, int bound, bool hideStateInfo, bool addDotStar, bool inReverse, bool onlyDFAinfo, int maxLabelLength, bool asNFA) { var graph = new DGML.RegexAutomaton(this, bound, addDotStar, inReverse, asNFA); var dgml = new DGML.DgmlWriter(writer, hideStateInfo, maxLabelLength, onlyDFAinfo); dgml.Write(graph); } - public override IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => + public IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => new SymbolicRegexSampler(_pattern, randomseed, negative).GenerateRandomMembers(k); #endif } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index 83906ff029c1a..b0d9de74668a0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -12,10 +12,8 @@ internal sealed class SymbolicRegexRunnerFactory : RegexRunnerFactory /// The unicode component, including the BDD algebra. internal static readonly UnicodeCategoryTheory s_unicode = new UnicodeCategoryTheory(new CharSetSolver()); - /// The matching engine. - internal readonly SymbolicRegexMatcher _matcher; - /// Minimum length computed - private readonly int _minRequiredLength; + /// The matching engine, for 64 or fewer minterms. 
A SymbolicRegexMatcher of ulong or BV + internal readonly ISymbolicRegexMatcher _matcher; /// Initializes the factory. public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) @@ -32,23 +30,22 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan var solver = (CharSetSolver)s_unicode._solver; SymbolicRegexNode root = converter.Convert(code.Tree.Root, topLevel: true); - _minRequiredLength = code.Tree.MinRequiredLength; - BDD[] minterms = root.ComputeMinterms(); if (minterms.Length > 64) { // Use BV to represent a predicate var algBV = new BVAlgebra(solver, minterms); - var builderBV = new SymbolicRegexBuilder(algBV); - - // The default constructor sets the following predicates to False; this update happens after the fact. - // It depends on whether anchors where used in the regex whether the predicates are actually different from False. - builderBV._wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors); - builderBV._newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate); + var builderBV = new SymbolicRegexBuilder(algBV) + { + // The default constructor sets the following predicates to False; this update happens after the fact. + // It depends on whether anchors were used in the regex whether the predicates are actually different from False. 
+ _wordLetterPredicateForAnchors = algBV.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors), + _newLinePredicate = algBV.ConvertFromCharSet(solver, converter._builder._newLinePredicate) + }; - //Convert the BDD based AST to BV based AST + // Convert the BDD-based AST to BV-based AST SymbolicRegexNode rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(rootBV, solver, minterms, matchTimeout, culture); + _matcher = new SymbolicRegexMatcher(rootBV, code, solver, minterms, matchTimeout, culture); } else { @@ -64,37 +61,31 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan // Convert the BDD-based AST to ulong-based AST SymbolicRegexNode root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(root64, solver, minterms, matchTimeout, culture); + _matcher = new SymbolicRegexMatcher(root64, code, solver, minterms, matchTimeout, culture); } } /// Creates a object. - protected internal override RegexRunner CreateInstance() => new Runner(_matcher, _minRequiredLength); + protected internal override RegexRunner CreateInstance() => _matcher is SymbolicRegexMatcher srmUInt64 ? + new Runner(srmUInt64) : + new Runner((SymbolicRegexMatcher)_matcher); /// Runner type produced by this factory. /// - /// The wrapped is itself thread-safe and can be shared across + /// The wrapped is itself thread-safe and can be shared across /// all runner instances, but the runner itself has state (e.g. for captures, positions, etc.) /// and must not be shared between concurrent uses. /// - private sealed class Runner : RegexRunner + private sealed class Runner : RegexRunner where TSetType : notnull { /// The matching engine. - private readonly SymbolicRegexMatcher _matcher; - /// Minimum length computed. 
- private readonly int _minRequiredLength; + private readonly SymbolicRegexMatcher _matcher; - internal Runner(SymbolicRegexMatcher matcher, int minRequiredLength) - { - _matcher = matcher; - _minRequiredLength = minRequiredLength; - } + internal Runner(SymbolicRegexMatcher matcher) => _matcher = matcher; protected override void InitTrackCount() { } // nop, no backtracking - protected override bool FindFirstChar() => - // The real logic is all in Go. Here we simply validate if there's enough text remaining to possibly match. - runtextpos <= runtextend - _minRequiredLength; + protected override bool FindFirstChar() => true; // The logic is all in Go. protected override void Go() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs index 3f965cec7cd93..8269e123a6f12 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs @@ -178,13 +178,11 @@ private IEnumerable> Step(List> states } private BDD ToBDD(S pred) => _solver.ConvertToCharSet(SymbolicRegexRunnerFactory.s_unicode._solver, pred); + private T Choose(IList elems) => elems[_random.Next(elems.Count)]; - private T Choose(IEnumerable elems) - { - List list = new List(elems); - return list[_random.Next(list.Count)]; - } + private char ChooseChar((uint, uint) pair) => (char)_random.Next((int)pair.Item1, (int)pair.Item2 + 1); + private char ChooseChar(BDD bdd) { Debug.Assert(!bdd.IsEmpty); @@ -192,8 +190,10 @@ private char ChooseChar(BDD bdd) BDD bdd1 = SymbolicRegexRunnerFactory.s_unicode._solver.And(bdd, _ascii); return ChooseChar(Choose(((CharSetSolver)SymbolicRegexRunnerFactory.s_unicode._solver).ToRanges(bdd1.IsEmpty ? 
bdd : bdd1))); } + private bool ChooseRandomlyTrueOrFalse() => _random.Next(100) < 50; /// Returns true if some state is unconditionally final + private bool IsFinal(IEnumerable> states) { foreach (SymbolicRegexNode state in states) @@ -205,6 +205,7 @@ private bool IsFinal(IEnumerable> states) } return false; } + /// Returns true if some state can be final private bool CanBeFinal(IEnumerable> states) { @@ -217,6 +218,7 @@ private bool CanBeFinal(IEnumerable> states) } return false; } + /// Returns true if some state is final in the given context private bool IsFinal(IEnumerable> states, uint context) { @@ -229,7 +231,9 @@ private bool IsFinal(IEnumerable> states, uint context) } return false; } + private bool IsWordchar(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._wordLetterPredicateForAnchors)); + private bool IsNewline(S pred) => _solver.IsSatisfiable(_solver.And(pred, _root._builder._newLinePredicate)); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs index 73b7249b408df..5900e44a29738 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs @@ -1,11 +1,13 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Diagnostics.CodeAnalysis; using System.IO; namespace System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG + [ExcludeFromCodeCoverage] internal static class GeneratorHelper { public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs index 00098b703b963..24d4ae4541372 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs @@ -3,12 +3,14 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; namespace System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG + [ExcludeFromCodeCoverage] internal static class IgnoreCaseRelationGenerator { private const string DefaultCultureName = "en-US"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs index 32b09bf1d14ae..c3ced759b0222 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/UnicodeCategoryRangesGenerator.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; @@ -10,6 +11,7 @@ namespace 
System.Text.RegularExpressions.Symbolic.Unicode { #if DEBUG /// Utility for generating unicode category ranges and corresponing binary decision diagrams. + [ExcludeFromCodeCoverage] internal static class UnicodeCategoryRangesGenerator { /// Generator for BDD Unicode category definitions. @@ -88,6 +90,7 @@ private static void WriteSerializedBDDs(StreamWriter sw) } /// Used internally for creating a collection of ranges for serialization. + [ExcludeFromCodeCoverage] internal sealed class Ranges { public readonly List ranges = new List(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs index 247c60ff46bd3..e15d49c78ff4c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -114,5 +114,21 @@ public static TResult CallOnEmptyStack(Func func(arg1, arg2, arg3)) .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The type of the fourth argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function. + /// The fourth argument to pass to the function. 
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => + Task.Run(() => func(arg1, arg2, arg3, arg4)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index 5b43fab690865..3204d7a989268 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -13,119 +13,121 @@ public class RegexGroupTests { public static IEnumerable Groups_Basic_TestData() { - // (A - B) B is a subset of A(ie B only contains chars that are in A) - yield return new object[] { null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + // (A - B) B is a subset of A(ie B only contains chars that are in A) + yield return new object[] { engine, null, "[abcd-[d]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "33312468955", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "51246897", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357]]+", "3312468977", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\w-[b-y]]+", 
"bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; - yield return new object[] { null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; + yield return new object[] { engine, null, @"[\w-[\d]]+", "0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; + yield return new object[] { engine, null, @"[\w-[\p{Ll}]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; - yield return new object[] { null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return new object[] { null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[2468]]+", "20135798", RegexOptions.None, new string[] { "013579" } }; - yield return new object[] { null, 
@"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[ae-z]]+", "aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}]]+", "az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; - // (A - B) B is a superset of A (ie B contains chars that are in A plus other chars that are not in A) - yield return new object[] { null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + // (A - B) B is a superset of A (ie B contains chars that are in A plus other chars that are not in A) + yield return new object[] { engine, null, "[abcd-[def]]+", "fedddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357a-z]]+", "az33312468955", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[de357fgA-Z]]+", "AZ51246897", RegexOptions.None, new string[] { "124689" } }; + yield return new object[] { engine, null, @"[\d-[357\p{Ll}]]+", "az3312468977", RegexOptions.None, new string[] { "124689" } }; - yield return new object[] { null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y\s]]+", " \tbbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, 
@"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; - yield return new object[] { null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; + yield return new object[] { engine, null, @"[\w-[\d\p{Po}]]+", "!#0AZaz9", RegexOptions.None, new string[] { "AZaz" } }; + yield return new object[] { engine, null, @"[\w-[\p{Ll}\s]]+", "a09AZz", RegexOptions.None, new string[] { "09AZ" } }; - yield return new object[] { null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[13579a-zA-Z]]+", "AZ1024689", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579abcd]]+", "abcd\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[13579\s]]+", " \t\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return new object[] { null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y\p{Po}]]+", "!#bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, null, @"[\w-[b-y!.,]]+", "!.,bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; + yield return new object[] { engine, 
null, "[\\w-[b-y\x00-\x0F]]+", "\0bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "aaaABCD09zzz" } }; - yield return new object[] { null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[ae-z0-9]]+", "09aaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[2468az]]+", "az20135798", RegexOptions.None, new string[] { "013579" } }; - yield return new object[] { null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[ae-zA-Z]]+", "AZaaabbbcccdddeee", RegexOptions.None, new string[] { "bbbcccddd" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[\p{Ll}0123456789]]+", "09az09AZ'[]", RegexOptions.None, new string[] { "AZ'[]" } }; - // (A - B) B only contains chars that are not in A - yield return new object[] { null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; + // (A - B) B only contains chars that are not in A + yield return new object[] { engine, null, "[abc-[defg]]+", "dddaabbccddd", RegexOptions.None, new string[] { "aabbcc" } }; - yield return new object[] { null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\d-[abc]]+", "abc09abc", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { 
engine, null, @"[\d-[a-zA-Z]]+", "az09AZ", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "az09az", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } }; + yield return new object[] { engine, null, @"[\w-[\x00-\x0F]]+", "bbbaaaABYZ09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABYZ09zzzyyy" } }; - yield return new object[] { null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; - yield return new object[] { null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; - yield return new object[] { null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } }; + yield return new object[] { engine, null, @"[\w-[\s]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; + yield return new object[] { engine, null, @"[\w-[\W]]+", "0AZaz9", RegexOptions.None, new string[] { "0AZaz9" } }; + yield return new object[] { engine, null, @"[\w-[\p{Po}]]+", "#a09AZz!", RegexOptions.None, new string[] { "a09AZz" } }; - yield return new object[] { null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } }; - yield return new object[] { null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; - yield return new object[] { null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; + yield return new object[] { engine, null, @"[\d-[\D]]+", "azAZ1024689", RegexOptions.ECMAScript, new string[] { "1024689" } }; + yield return new object[] { engine, null, @"[\d-[a-zA-Z]]+", "azAZ\x066102468\x0660", RegexOptions.ECMAScript, new string[] { "02468" } }; + yield return new object[] { engine, null, @"[\d-[\p{Ll}]]+", "\x066102468\x0660", RegexOptions.None, new string[] { "\x066102468\x0660" } }; - yield return 
new object[] { null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[\s]]+", " \tazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; - yield return new object[] { null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; - yield return new object[] { null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[\W]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; + yield return new object[] { engine, null, @"[a-zA-Z0-9-[^a-zA-Z0-9]]+", "bbbaaaABCD09zzzyyy", RegexOptions.None, new string[] { "bbbaaaABCD09zzzyyy" } }; - yield return new object[] { null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } }; + yield return new object[] { engine, null, @"[\p{Ll}-[A-Z]]+", "AZaz09", RegexOptions.None, new string[] { "az" } }; + yield return new object[] { engine, null, @"[\p{Nd}-[a-z]]+", "az09", RegexOptions.None, new string[] { "09" } }; - yield return new object[] { null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; - yield return new object[] { null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } }; - yield return new object[] { null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[\p{Lu}]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; + yield return new object[] { engine, null, @"[\P{Lu}-[A-Z]]+", "AZazAZ", RegexOptions.None, new string[] { "az" } }; + yield return 
new object[] { engine, null, @"[\P{Nd}-[\p{Nd}]]+", "azAZ09", RegexOptions.None, new string[] { "azAZ" } }; + yield return new object[] { engine, null, @"[\P{Nd}-[2-8]]+", "1234567890azAZ1234567890", RegexOptions.None, new string[] { "azAZ" } }; - // Alternating construct - yield return new object[] { null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } }; - yield return new object[] { null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } }; - yield return new object[] { null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } }; - yield return new object[] { null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } }; - yield return new object[] { null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } }; - yield return new object[] { null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } }; - yield return new object[] { null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } }; - yield return new object[] { null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } }; - yield return new object[] { null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } }; - yield return new object[] { null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } }; - yield return new object[] { null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } }; + // 
Alternating construct + yield return new object[] { engine, null, @"([ ]|[\w-[0-9]])+", "09az AZ90", RegexOptions.None, new string[] { "az AZ", "Z" } }; + yield return new object[] { engine, null, @"([0-9-[02468]]|[0-9-[13579]])+", "az1234567890za", RegexOptions.None, new string[] { "1234567890", "0" } }; + yield return new object[] { engine, null, @"([^0-9-[a-zAE-Z]]|[\w-[a-zAF-Z]])+", "azBCDE1234567890BCDEFza", RegexOptions.None, new string[] { "BCDE1234567890BCDE", "E" } }; + yield return new object[] { engine, null, @"([\p{Ll}-[aeiou]]|[^\w-[\s]])+", "aeiobcdxyz!@#aeio", RegexOptions.None, new string[] { "bcdxyz!@#", "#" } }; + yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hello", RegexOptions.None, new string[] { "hello" } }; + yield return new object[] { engine, null, @"(hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi", "hi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){1,3}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){2,2}", "hellohihey", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:hello|hi){2,2}?", "hellohihihello", RegexOptions.None, new string[] { "hellohi" } }; + yield return new object[] { engine, null, @"(?:abc|def|ghi|hij|klm|no){1,4}", "this is a test nonoabcxyz this is only a test", RegexOptions.None, new string[] { "nonoabc" } }; + yield return new object[] { engine, null, @"xyz(abc|def)xyz", "abcxyzdefxyzabc", RegexOptions.None, new string[] { "xyzdefxyz", "def" } }; + yield return new object[] { engine, null, @"abc|(?:def|ghi)", "ghi", RegexOptions.None, new string[] { "ghi" } }; + yield return new object[] { engine, null, @"abc|(def|ghi)", "def", RegexOptions.None, new string[] { "def", "def" } }; - // Multiple character classes using character class subtraction - yield return new object[] { null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 
98871", RegexOptions.None, new string[] { "98871" } }; - yield return new object[] { null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } }; + // Multiple character classes using character class subtraction + yield return new object[] { engine, null, @"98[\d-[9]][\d-[8]][\d-[0]]", "98911 98881 98870 98871", RegexOptions.None, new string[] { "98871" } }; + yield return new object[] { engine, null, @"m[\w-[^aeiou]][\w-[^aeiou]]t", "mbbt mect meet", RegexOptions.None, new string[] { "meet" } }; - // Negation with character class subtraction - yield return new object[] { null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } }; - yield return new object[] { null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } }; + // Negation with character class subtraction + yield return new object[] { engine, null, "[abcdef-[^bce]]+", "adfbcefda", RegexOptions.None, new string[] { "bce" } }; + yield return new object[] { engine, null, "[^cde-[ag]]+", "agbfxyzga", RegexOptions.None, new string[] { "bfxyz" } }; - // Misc The idea here is come up with real world examples of char class subtraction. Things that - // would be difficult to define without it - yield return new object[] { null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } }; + // Misc The idea here is come up with real world examples of char class subtraction. 
Things that + // would be difficult to define without it + yield return new object[] { engine, null, @"[\p{L}-[^\p{Lu}]]+", "09',.abcxyzABCXYZ", RegexOptions.None, new string[] { "ABCXYZ" } }; - yield return new object[] { null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } }; - yield return new object[] { null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } }; + yield return new object[] { engine, null, @"[\p{IsGreek}-[\P{Lu}]]+", "\u0390\u03FE\u0386\u0388\u03EC\u03EE\u0400", RegexOptions.None, new string[] { "\u03FE\u0386\u0388\u03EC\u03EE" } }; + yield return new object[] { engine, null, @"[\p{IsBasicLatin}-[G-L]]+", "GAFMZL", RegexOptions.None, new string[] { "AFMZ" } }; - yield return new object[] { null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } }; + yield return new object[] { engine, null, "[a-zA-Z-[aeiouAEIOU]]+", "aeiouAEIOUbcdfghjklmnpqrstvwxyz", RegexOptions.None, new string[] { "bcdfghjklmnpqrstvwxyz" } }; - // The following is an overly complex way of matching an ip address using char class subtraction - yield return new object[] { null, @"^ + // The following is an overly complex way of matching an ip address using char class subtraction + yield return new object[] { engine, null, @"^ (?^ ( ( @@ -157,370 +159,370 @@ public static IEnumerable Groups_Basic_TestData() )$" , "255", RegexOptions.IgnorePatternWhitespace, new string[] { "255", "255", "2", "5", "5", "", "255", "2", "5" } }; - // Character Class Substraction - yield return new object[] { null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", RegexOptions.None, new string[] { "aaa---ddd" } }; - yield return new object[] { null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } }; - yield return new object[] { null, @"[\[\]a-f-[[]]+", 
"gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } }; - yield return new object[] { null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } }; - - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; - - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; - yield return new object[] { null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } }; - - yield return new object[] { null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } }; - - yield return new object[] { null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } }; - - // Not Character class substraction - yield return new object[] { null, @"[\0- [bc]+", "!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } }; - yield return new object[] { null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } }; - yield return new object[] { null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } }; - yield return new object[] { null, "[-e-g]+", 
"ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } }; - yield return new object[] { null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } }; - yield return new object[] { null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } }; - yield return new object[] { null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } }; - - // Make sure we correctly handle \- - yield return new object[] { null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; - yield return new object[] { null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } }; - yield return new object[] { null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } }; - yield return new object[] { null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } }; - yield return new object[] { null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } }; - - // Everything works correctly with option RegexOptions.IgnorePatternWhitespace - yield return new object[] { null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } }; - yield return new object[] { null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } }; - - // Unicode Char Classes - yield return new object[] { null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - 
yield return new object[] { null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } }; - yield return new object[] { null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } }; - yield return new object[] { null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - - // Character ranges IgnoreCase - yield return new object[] { null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } }; - yield return new object[] { null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } }; - yield return new object[] { null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } }; - yield return new object[] { null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" } }; - yield return new object[] { null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } }; - yield return new object[] { null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } }; - yield return new object[] { null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } }; - yield return new object[] { null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", 
RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } }; - yield return new object[] { null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } }; - yield return new object[] { null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } }; - yield return new object[] { null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } }; - - // Escape Chars - yield return new object[] { null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } }; - yield return new object[] { null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } }; - yield return new object[] { null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } }; - - // Miscellaneous { witout matching } - yield return new object[] { null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } }; - yield return new object[] { null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } }; - yield return new object[] { null, @"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } }; - - // Miscellaneous inline options - yield return new object[] { null, @"(?n:(?cat)(\s+)(?dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } }; - yield return new object[] { null, @"(?n:(cat)(?\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"(?x: + // Character Class Substraction + yield return new object[] { engine, null, @"[abcd\-d-[bc]]+", "bbbaaa---dddccc", 
RegexOptions.None, new string[] { "aaa---ddd" } }; + yield return new object[] { engine, null, @"[^a-f-[\x00-\x60\u007B-\uFFFF]]+", "aaafffgggzzz{{{", RegexOptions.None, new string[] { "gggzzz" } }; + yield return new object[] { engine, null, @"[\[\]a-f-[[]]+", "gggaaafff]]][[[", RegexOptions.None, new string[] { "aaafff]]]" } }; + yield return new object[] { engine, null, @"[\[\]a-f-[]]]+", "gggaaafff[[[]]]", RegexOptions.None, new string[] { "aaafff[[[" } }; + + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[-[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; + + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "a]]", RegexOptions.None, new string[] { "a]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "b]]", RegexOptions.None, new string[] { "b]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "c]]", RegexOptions.None, new string[] { "c]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "d]]", RegexOptions.None, new string[] { "d]]" } }; + yield return new object[] { engine, null, @"[ab\-\[cd-[[]]]]", "-]]", RegexOptions.None, new string[] { "-]]" } }; + + yield return new object[] { engine, null, @"[a-[c-e]]+", "bbbaaaccc", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[a-[c-e]]+", "```aaaccc", RegexOptions.None, new string[] { "aaa" } }; + + yield return new object[] { engine, null, @"[a-d\--[bc]]+", "cccaaa--dddbbb", RegexOptions.None, new string[] { "aaa--ddd" } }; + + // Not Character class substraction + yield return new object[] { engine, null, @"[\0- [bc]+", 
"!!!\0\0\t\t [[[[bbbcccaaa", RegexOptions.None, new string[] { "\0\0\t\t [[[[bbbccc" } }; + yield return new object[] { engine, null, "[[abcd]-[bc]]+", "a-b]", RegexOptions.None, new string[] { "a-b]" } }; + yield return new object[] { engine, null, "[-[e-g]+", "ddd[[[---eeefffggghhh", RegexOptions.None, new string[] { "[[[---eeefffggg" } }; + yield return new object[] { engine, null, "[-e-g]+", "ddd---eeefffggghhh", RegexOptions.None, new string[] { "---eeefffggg" } }; + yield return new object[] { engine, null, "[a-e - m-p]+", "---a b c d e m n o p---", RegexOptions.None, new string[] { "a b c d e m n o p" } }; + yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaaddd]", RegexOptions.None, new string[] { "d]" } }; + yield return new object[] { engine, null, "[^-[bc]]", "b] c] -] aaa]ddd]", RegexOptions.None, new string[] { "a]" } }; + + // Make sure we correctly handle \- + yield return new object[] { engine, null, @"[a\-[bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[a\-[\-\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[a\-\[\-\[\-bc]+", "```bbbaaa---[[[cccddd", RegexOptions.None, new string[] { "bbbaaa---[[[ccc" } }; + yield return new object[] { engine, null, @"[abc\--[b]]+", "[[[```bbbaaa---cccddd", RegexOptions.None, new string[] { "aaa---ccc" } }; + yield return new object[] { engine, null, @"[abc\-z-[b]]+", "```aaaccc---zzzbbb", RegexOptions.None, new string[] { "aaaccc---zzz" } }; + yield return new object[] { engine, null, @"[a-d\-[b]+", "```aaabbbcccddd----[[[[]]]", RegexOptions.None, new string[] { "aaabbbcccddd----[[[[" } }; + yield return new object[] { engine, null, @"[abcd\-d\-[bc]+", "bbbaaa---[[[dddccc", RegexOptions.None, new string[] { "bbbaaa---[[[dddccc" } }; + + // Everything works correctly with option RegexOptions.IgnorePatternWhitespace + yield 
return new object[] { engine, null, "[a - c - [ b ] ]+", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { " ]]]" } }; + yield return new object[] { engine, null, "[a - c - [ b ] +", "dddaaa ccc [[[[ bbb ]]]", RegexOptions.IgnorePatternWhitespace, new string[] { "aaa ccc [[[[ bbb " } }; + + // Unicode Char Classes + yield return new object[] { engine, null, @"(\p{Lu}\w*)\s(\p{Lu}\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\p{Lu}\p{Ll}*)\s(\p{Lu}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\P{Ll}\p{Ll}*)\s(\P{Ll}\p{Ll}*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"(\P{Lu}+\p{Lu})\s(\P{Lu}+\p{Lu})", "hellO worlD", RegexOptions.None, new string[] { "hellO worlD", "hellO", "worlD" } }; + yield return new object[] { engine, null, @"(\p{Lt}\w*)\s(\p{Lt}*\w*)", "\u01C5ello \u01C5orld", RegexOptions.None, new string[] { "\u01C5ello \u01C5orld", "\u01C5ello", "\u01C5orld" } }; + yield return new object[] { engine, null, @"(\P{Lt}\w*)\s(\P{Lt}*\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + + // Character ranges IgnoreCase + yield return new object[] { engine, null, @"[@-D]+", "eE?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { "@ABCDabcd" } }; + yield return new object[] { engine, null, @"[>-D]+", "eE=>?@ABCDabcdeE", RegexOptions.IgnoreCase, new string[] { ">?@ABCDabcd" } }; + yield return new object[] { engine, null, @"[\u0554-\u0557]+", "\u0583\u0553\u0554\u0555\u0556\u0584\u0585\u0586\u0557\u0558", RegexOptions.IgnoreCase, new string[] { "\u0554\u0555\u0556\u0584\u0585\u0586\u0557" } }; + yield return new object[] { engine, null, @"[X-\]]+", "wWXYZxyz[\\]^", RegexOptions.IgnoreCase, new string[] { "XYZxyz[\\]" 
} }; + yield return new object[] { engine, null, @"[X-\u0533]+", "\u0551\u0554\u0560AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563\u0564", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz\u0531\u0532\u0533\u0561\u0562\u0563" } }; + yield return new object[] { engine, null, @"[X-a]+", "wWAXYZaxyz", RegexOptions.IgnoreCase, new string[] { "AXYZaxyz" } }; + yield return new object[] { engine, null, @"[X-c]+", "wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "ABCXYZabcxyz" } }; + yield return new object[] { engine, null, @"[X-\u00C0]+", "\u00C1\u00E1\u00C0\u00E0wWABCXYZabcxyz", RegexOptions.IgnoreCase, new string[] { "\u00C0\u00E0wWABCXYZabcxyz" } }; + yield return new object[] { engine, null, @"[\u0100\u0102\u0104]+", "\u00FF \u0100\u0102\u0104\u0101\u0103\u0105\u0106", RegexOptions.IgnoreCase, new string[] { "\u0100\u0102\u0104\u0101\u0103\u0105" } }; + yield return new object[] { engine, null, @"[B-D\u0130]+", "aAeE\u0129\u0131\u0068 BCDbcD\u0130\u0069\u0070", RegexOptions.IgnoreCase, new string[] { "BCDbcD\u0130\u0069" } }; + yield return new object[] { engine, null, @"[\u013B\u013D\u013F]+", "\u013A\u013B\u013D\u013F\u013C\u013E\u0140\u0141", RegexOptions.IgnoreCase, new string[] { "\u013B\u013D\u013F\u013C\u013E\u0140" } }; + + // Escape Chars + yield return new object[] { engine, null, "(Cat)\r(Dog)", "Cat\rDog", RegexOptions.None, new string[] { "Cat\rDog", "Cat", "Dog" } }; + yield return new object[] { engine, null, "(Cat)\t(Dog)", "Cat\tDog", RegexOptions.None, new string[] { "Cat\tDog", "Cat", "Dog" } }; + yield return new object[] { engine, null, "(Cat)\f(Dog)", "Cat\fDog", RegexOptions.None, new string[] { "Cat\fDog", "Cat", "Dog" } }; + + // Miscellaneous { witout matching } + yield return new object[] { engine, null, @"{5", "hello {5 world", RegexOptions.None, new string[] { "{5" } }; + yield return new object[] { engine, null, @"{5,", "hello {5, world", RegexOptions.None, new string[] { "{5," } }; + yield return new object[] { engine, null, 
@"{5,6", "hello {5,6 world", RegexOptions.None, new string[] { "{5,6" } }; + + // Miscellaneous inline options + yield return new object[] { engine, null, @"(?n:(?cat)(\s+)(?dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?n:(cat)(\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog" } }; + yield return new object[] { engine, null, @"(?n:(cat)(?\s+)(dog))", "cat dog", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"(?x: (?cat) # Cat statement (\s+) # Whitespace chars (?dog # Dog statement ))", "cat dog", RegexOptions.None, new string[] { "cat dog", " ", "cat", "dog" } }; - yield return new object[] { null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p inside character range - yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } }; - yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; - - // \x, \u, 
\a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range - yield return new object[] { null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; - yield return new object[] { null, @"(cat)([\u0041]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; - yield return new object[] { null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } }; - yield return new object[] { null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } }; - yield return new object[] { null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } }; - yield return new object[] { null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } }; - yield return new object[] { null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } }; - yield return new object[] { null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option - yield return new object[] { null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } }; - yield return new object[] { null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\w]*)", "sfdcatdog 
3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; - yield return new object[] { null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; - yield return new object[] { null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; - - // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option - yield return new object[] { null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } }; - yield return new object[] { null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; - yield return new object[] { null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } }; - yield return new object[] { null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } }; - - // Use < in a group - yield return new object[] { null, @"cat(?dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } }; - yield return new object[] { null, @"(?cat)\s*(?dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat 
dog", "dog" } }; - yield return new object[] { null, @"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; - yield return new object[] { null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; - yield return new object[] { null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } }; - yield return new object[] { null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } }; - yield return new object[] { null, @"(?<1>cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - yield return new object[] { null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; - - // Quantifiers - yield return new object[] { null, @"(?cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } }; - yield return new object[] { null, @"(?cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } }; - yield return new object[] { null, @"(?cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } }; - yield return new object[] { null, @"(?cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } }; - yield return new object[] { null, @"(?cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } }; - yield return new object[] { null, @"(?cat){1,END", "STARTcat{1,END", RegexOptions.None, new string[] { 
"cat{1,END", "cat" } }; - yield return new object[] { null, @"(?cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } }; - yield return new object[] { null, @"(?cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } }; - - // Use IgnorePatternWhitespace - yield return new object[] { null, @"(cat) #cat + yield return new object[] { engine, null, @"(?+i:cat)", "CAT", RegexOptions.None, new string[] { "CAT" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p inside character range + yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.None, new string[] { "cat230927dog", "230927" } }; + yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.None, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.None, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.None, new string[] { "Hello World", "Hello", "World" } }; + + // \x, \u, \a, \b, \e, \f, \n, \r, \t, \v, \c, inside character range + yield return new object[] { engine, null, @"(cat)([\x41]*)(dog)", "catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\u0041]*)(dog)", 
"catAAAdog", RegexOptions.None, new string[] { "catAAAdog", "cat", "AAA", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\a]*)(dog)", "cat\a\a\adog", RegexOptions.None, new string[] { "cat\a\a\adog", "cat", "\a\a\a", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\b]*)(dog)", "cat\b\b\bdog", RegexOptions.None, new string[] { "cat\b\b\bdog", "cat", "\b\b\b", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\e]*)(dog)", "cat\u001B\u001B\u001Bdog", RegexOptions.None, new string[] { "cat\u001B\u001B\u001Bdog", "cat", "\u001B\u001B\u001B", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\f]*)(dog)", "cat\f\f\fdog", RegexOptions.None, new string[] { "cat\f\f\fdog", "cat", "\f\f\f", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\r]*)(dog)", "cat\r\r\rdog", RegexOptions.None, new string[] { "cat\r\r\rdog", "cat", "\r\r\r", "dog" } }; + yield return new object[] { engine, null, @"(cat)([\v]*)(dog)", "cat\v\v\vdog", RegexOptions.None, new string[] { "cat\v\v\vdog", "cat", "\v\v\v", "dog" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p inside character range ([0-5]) with ECMA Option + yield return new object[] { engine, null, @"cat([\d]*)dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "230927" } }; + yield return new object[] { engine, null, @"([\D]*)dog", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"cat([\s]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"cat([\S]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\w]*)", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"cat([\W]*)dog", "wiocat dog3270", RegexOptions.ECMAScript, new 
string[] { "cat dog", " " } }; + yield return new object[] { engine, null, @"([\p{Lu}]\w*)\s([\p{Lu}]\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; + yield return new object[] { engine, null, @"([\P{Ll}][\p{Ll}]*)\s([\P{Ll}][\p{Ll}]*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "Hello", "World" } }; + + // \d, \D, \s, \S, \w, \W, \P, \p outside character range ([0-5]) with ECMA Option + yield return new object[] { engine, null, @"(cat)\d*dog", "hello123cat230927dog1412d", RegexOptions.ECMAScript, new string[] { "cat230927dog", "cat" } }; + yield return new object[] { engine, null, @"\D*(dog)", "65498catdog58719", RegexOptions.ECMAScript, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\S*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"(cat)\w*", "sfdcatdog 3270", RegexOptions.ECMAScript, new string[] { "catdog", "cat" } }; + yield return new object[] { engine, null, @"(cat)\W*(dog)", "wiocat dog3270", RegexOptions.ECMAScript, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\p{Lu}(\w*)\s\p{Lu}(\w*)", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World", "ello", "orld" } }; + yield return new object[] { engine, null, @"\P{Ll}\p{Ll}*\s\P{Ll}\p{Ll}*", "Hello World", RegexOptions.ECMAScript, new string[] { "Hello World" } }; + + // Use < in a group + yield return new object[] { engine, null, @"cat(?dog)", "catcatdogdogcat", RegexOptions.None, new string[] { "catdog", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s*(?dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, 
@"(?<1>cat)\s*(?<1>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, @"(?<2048>cat)\s*(?<2048>dog)", "catcat dogdogcat", RegexOptions.None, new string[] { "cat dog", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?<-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?<1>cat)\w+(?dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?cat)\w+(?<2-cat>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + yield return new object[] { engine, null, @"(?<1>cat)\w+(?<2-1>dog)", "cat_Hello_World_dog", RegexOptions.None, new string[] { "cat_Hello_World_dog", "", "_Hello_World_" } }; + + // Quantifiers + yield return new object[] { engine, null, @"(?cat){", "STARTcat{", RegexOptions.None, new string[] { "cat{", "cat" } }; + yield return new object[] { engine, null, @"(?cat){fdsa", "STARTcat{fdsa", RegexOptions.None, new string[] { "cat{fdsa", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1", "STARTcat{1", RegexOptions.None, new string[] { "cat{1", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1END", "STARTcat{1END", RegexOptions.None, new string[] { "cat{1END", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,", "STARTcat{1,", RegexOptions.None, new string[] { "cat{1,", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,END", 
"STARTcat{1,END", RegexOptions.None, new string[] { "cat{1,END", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,2", "STARTcat{1,2", RegexOptions.None, new string[] { "cat{1,2", "cat" } }; + yield return new object[] { engine, null, @"(?cat){1,2END", "STARTcat{1,2END", RegexOptions.None, new string[] { "cat{1,2END", "cat" } }; + + // Use IgnorePatternWhitespace + yield return new object[] { engine, null, @"(cat) #cat \s+ #followed by 1 or more whitespace (dog) #followed by dog ", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat) #cat + yield return new object[] { engine, null, @"(cat) #cat \s+ #followed by 1 or more whitespace (dog) #followed by dog", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; - - // Back Reference - yield return new object[] { null, @"(?cat)(?dog)\k", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\k", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - - yield return new object[] { null, @"(?cat)\s+(?dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new 
object[] { null, @"(?cat)\s+(?dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } }; - - yield return new object[] { null, @"(?cat)\s+(?dog)\k", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; - yield return new object[] { null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } }; - - // Octal - yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } }; - yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" 
} }; - yield return new object[] { null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } }; - yield return new object[] { null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } }; - yield return new object[] { null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } }; - yield return new object[] { null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } }; - yield return new object[] { null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } }; - yield return new object[] { null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } }; - - yield return new object[] { null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } }; - yield return new object[] { null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" 
} }; - yield return new object[] { null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } }; - yield return new object[] { null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; - yield return new object[] { null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; - yield return new object[] { null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } }; - yield return new object[] { null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } }; - yield return new object[] { null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } }; - - // Decimal - yield return new object[] { null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; - - // Hex - yield return new object[] { null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; - yield return new object[] { null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; - yield return new object[] { null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; - yield return new object[] { null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; - yield return new object[] { null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; - yield return new object[] { null, 
@"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; - - yield return new object[] { null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; - yield return new object[] { null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; - yield return new object[] { null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; - yield return new object[] { null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; - yield return new object[] { null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; - yield return new object[] { null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; - - // ScanControl - yield return new object[] { null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } }; - yield return new object[] { null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; - yield return new object[] { null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; - - yield return new object[] { null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; - yield return new object[] { null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; - - yield return new object[] { null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", 
"dog" } }; - yield return new object[] { null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; - - yield return new object[] { null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; - yield return new object[] { null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; - - yield return new object[] { null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; - yield return new object[] { null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; - - if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759 - { - yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } }; - } + yield return new object[] { engine, null, @"(cat) (?#cat) \s+ (?#followed by 1 or more whitespace) (dog) (?#followed by dog)", "cat dog", RegexOptions.IgnorePatternWhitespace, new string[] { "cat dog", "cat", "dog" } }; + + // Back Reference + yield return new object[] { engine, null, @"(?cat)(?dog)\k", "asdfcatdogcatdog", RegexOptions.None, new string[] { "catdogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, 
@"(?cat)\s+(?dog)\'cat'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\<1>", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\'1'", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.None, new string[] { "cat dogcat", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\1", "asdfcat dogcat dog", RegexOptions.ECMAScript, new string[] { "cat dogcat", "cat", "dog" } }; + + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\k", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.None, new string[] { "cat dogdog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(?cat)\s+(?dog)\2", "asdfcat dogdog dog", RegexOptions.ECMAScript, new string[] { "cat dogdog", "cat", "dog" } }; + + // Octal + yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" } }; + yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.None, new string[] { "cat?", "cat", "?" 
} }; + yield return new object[] { engine, null, @"(cat)(\176)", "hellocat~dogworld", RegexOptions.None, new string[] { "cat~", "cat", "~" } }; + yield return new object[] { engine, null, @"(cat)(\400)", "hellocat\0dogworld", RegexOptions.None, new string[] { "cat\0", "cat", "\0" } }; + yield return new object[] { engine, null, @"(cat)(\300)", "hellocat\u00C0dogworld", RegexOptions.None, new string[] { "cat\u00C0", "cat", "\u00C0" } }; + yield return new object[] { engine, null, @"(cat)(\477)", "hellocat\u003Fdogworld", RegexOptions.None, new string[] { "cat\u003F", "cat", "\u003F" } }; + yield return new object[] { engine, null, @"(cat)(\777)", "hellocat\u00FFdogworld", RegexOptions.None, new string[] { "cat\u00FF", "cat", "\u00FF" } }; + yield return new object[] { engine, null, @"(cat)(\7770)", "hellocat\u00FF0dogworld", RegexOptions.None, new string[] { "cat\u00FF0", "cat", "\u00FF0" } }; + + yield return new object[] { engine, null, @"(cat)(\077)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" } }; + yield return new object[] { engine, null, @"(cat)(\77)", "hellocat?dogworld", RegexOptions.ECMAScript, new string[] { "cat?", "cat", "?" 
} }; + yield return new object[] { engine, null, @"(cat)(\7)", "hellocat\adogworld", RegexOptions.ECMAScript, new string[] { "cat\a", "cat", "\a" } }; + yield return new object[] { engine, null, @"(cat)(\40)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; + yield return new object[] { engine, null, @"(cat)(\040)", "hellocat dogworld", RegexOptions.ECMAScript, new string[] { "cat ", "cat", " " } }; + yield return new object[] { engine, null, @"(cat)(\176)", "hellocatcat76dogworld", RegexOptions.ECMAScript, new string[] { "catcat76", "cat", "cat76" } }; + yield return new object[] { engine, null, @"(cat)(\377)", "hellocat\u00FFdogworld", RegexOptions.ECMAScript, new string[] { "cat\u00FF", "cat", "\u00FF" } }; + yield return new object[] { engine, null, @"(cat)(\400)", "hellocat 0Fdogworld", RegexOptions.ECMAScript, new string[] { "cat 0", "cat", " 0" } }; + + // Decimal + yield return new object[] { engine, null, @"(cat)\s+(?<2147483646>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(?<2147483647>dog)", "asdlkcat dogiwod", RegexOptions.None, new string[] { "cat dog", "cat", "dog" } }; + + // Hex + yield return new object[] { engine, null, @"(cat)(\x2a*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2b*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2c*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2d*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2e*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new 
string[] { "cat...dog", "cat", "...", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2f*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\x2A*)(dog)", "asdlkcat***dogiwod", RegexOptions.None, new string[] { "cat***dog", "cat", "***", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2B*)(dog)", "asdlkcat+++dogiwod", RegexOptions.None, new string[] { "cat+++dog", "cat", "+++", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2C*)(dog)", "asdlkcat,,,dogiwod", RegexOptions.None, new string[] { "cat,,,dog", "cat", ",,,", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2D*)(dog)", "asdlkcat---dogiwod", RegexOptions.None, new string[] { "cat---dog", "cat", "---", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2E*)(dog)", "asdlkcat...dogiwod", RegexOptions.None, new string[] { "cat...dog", "cat", "...", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\x2F*)(dog)", "asdlkcat///dogiwod", RegexOptions.None, new string[] { "cat///dog", "cat", "///", "dog" } }; + + // ScanControl + yield return new object[] { engine, null, @"(cat)(\c@*)(dog)", "asdlkcat\0\0dogiwod", RegexOptions.None, new string[] { "cat\0\0dog", "cat", "\0\0", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cA*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\ca*)(dog)", "asdlkcat\u0001dogiwod", RegexOptions.None, new string[] { "cat\u0001dog", "cat", "\u0001", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cC*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", "cat", "\u0003", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cc*)(dog)", "asdlkcat\u0003dogiwod", RegexOptions.None, new string[] { "cat\u0003dog", 
"cat", "\u0003", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cD*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cd*)(dog)", "asdlkcat\u0004dogiwod", RegexOptions.None, new string[] { "cat\u0004dog", "cat", "\u0004", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cX*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cx*)(dog)", "asdlkcat\u0018dogiwod", RegexOptions.None, new string[] { "cat\u0018dog", "cat", "\u0018", "dog" } }; + + yield return new object[] { engine, null, @"(cat)(\cZ*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; + yield return new object[] { engine, null, @"(cat)(\cz*)(dog)", "asdlkcat\u001adogiwod", RegexOptions.None, new string[] { "cat\u001adog", "cat", "\u001a", "dog" } }; + + if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759 + { + yield return new object[] { engine, null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } }; + } - // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B - //\A - yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\G - yield return new object[] { null, @"\Gcat\s+dog", 
"cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //^ - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } }; - yield return new object[] { null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } }; - yield return new object[] { null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\Z - yield return new object[] { 
null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\z - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"cat\s+dog\z", "cat \n\n\n dog", 
RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - yield return new object[] { null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; - - //\b - yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } }; - yield return new object[] { null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; - yield return new object[] { null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } }; - yield return new object[] { null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } }; - yield return new object[] { null, @"\b\cat)\s+(?dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } }; - - // Balanced Matching - yield return new object[] { null, @"
+ // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B + //\A + yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Acat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\A(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\G + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\Gcat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"\G(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //^ + yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"mouse\s\n^cat\s+dog", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog" } }; + yield return new 
object[] { engine, null, @"^cat\s+dog", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(mouse)\s\n^(cat)\s+(dog)", "mouse\n\ncat \n\n\n dog", RegexOptions.Multiline, new string[] { "mouse\n\ncat \n\n\n dog", "mouse", "cat", "dog" } }; + yield return new object[] { engine, null, @"^(cat)\s+(dog)", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\Z + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat 
\n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\Z", "cat \n\n\n dog\n", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\z + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"cat\s+dog\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.None, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.Multiline, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + yield return new object[] { engine, null, @"(cat)\s+(dog)\z", "cat \n\n\n dog", RegexOptions.ECMAScript, new string[] { "cat \n\n\n dog", "cat", "dog" } }; + + //\b + yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @"\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "cat", 
RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.None, new string[] { "dog cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "cat", RegexOptions.ECMAScript, new string[] { "cat" } }; + yield return new object[] { engine, null, @".*\bcat\b", "dog cat mouse", RegexOptions.ECMAScript, new string[] { "dog cat" } }; + yield return new object[] { engine, null, @"\b@cat", "123START123@catEND", RegexOptions.None, new string[] { "@cat" } }; + yield return new object[] { engine, null, @"\b\cat)\s+(?dog)\s+\123\s+\234", "asdfcat dog cat23 dog34eia", RegexOptions.ECMAScript, new string[] { "cat dog cat23 dog34", "cat", "dog" } }; + + // Balanced Matching + yield return new object[] { engine, null, @"
(?>
(?) |
(?<-DEPTH>) | @@ -529,374 +531,390 @@ public static IEnumerable Groups_Basic_TestData() (?(DEPTH)(?!))
", "
this is some
red
text
", RegexOptions.IgnorePatternWhitespace, new string[] { "
this is some
red
text
", "" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( ((?'open'<+)[^<>]*)+ ((?'close-open'>+)[^<>]*)+ )+", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "<02deep_03<03deep_03>>>", "<03deep_03", ">>>", "<", "03deep_03" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<)? [^<>]? (?>)? )*", "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "<01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>>", "", "", "01deep_01<02deep_01<03deep_01>><02deep_02><02deep_03<03deep_03>>" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<[^/<>]*>)? [^<>]? (?]*>)? )*", "Cat", RegexOptions.IgnorePatternWhitespace, new string[] { "Cat", "", "", "Cat" } }; - yield return new object[] { null, @"( + yield return new object[] { engine, null, @"( (?<(?[^/<>]*)>)? [^<>]? (?>)? )*", "catdog", RegexOptions.IgnorePatternWhitespace, new string[] { "catdog", "", "", "a", "dog" } }; - // Balanced Matching With Backtracking - yield return new object[] { null, @"( + // Balanced Matching With Backtracking + yield return new object[] { engine, null, @"( (?<[^/<>]*>)? .? (?]*>)? 
)* (?(start)(?!)) ", "Cat<<<>>><><<<>>>>", RegexOptions.IgnorePatternWhitespace, new string[] { "Cat<<<>>><><<<>>>>", "", "", "Cat" } }; - // Character Classes and Lazy quantifier - yield return new object[] { null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } }; - yield return new object[] { null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } }; + // Character Classes and Lazy quantifier + yield return new object[] { engine, null, @"([0-9]+?)([\w]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55", "5", "5" } }; + yield return new object[] { engine, null, @"([0-9]+?)([a-z]+?)", "55488aheiaheiad", RegexOptions.ECMAScript, new string[] { "55488a", "55488", "a" } }; - // Miscellaneous/Regression scenarios - yield return new object[] { null, @"(?1)(?.*?)(?=2)", "1" + Environment.NewLine + "" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture, + // Miscellaneous/Regression scenarios + yield return new object[] { engine, null, @"(?1)(?.*?)(?=2)", "1" + Environment.NewLine + "" + Environment.NewLine + "2", RegexOptions.Singleline | RegexOptions.ExplicitCapture, new string[] { "1" + Environment.NewLine + "" + Environment.NewLine, "1", Environment.NewLine + ""+ Environment.NewLine } }; - yield return new object[] { null, @"\G<%#(?.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } }; - - // Nested Quantifiers - yield return new object[] { null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } }; - - // Lazy operator Backtracking - yield return new object[] { null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } }; - yield return new 
object[] { null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } }; - - yield return new object[] { null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } }; - yield return new object[] { null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - - // Backtracking - yield return new object[] { null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - yield return new object[] { null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; - - // Backtracking with multiple (.*) groups -- important ASP.NET scenario - yield return new object[] { null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", 
RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with multiple (.+) groups - yield return new object[] { null, @"(.+)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with (.+) group followed by (.*) - yield return new object[] { null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, 
@"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Backtracking with (.*) group followed by (.+) - yield return new object[] { null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; - yield return new object[] { null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; - yield return new object[] { null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; - - // Quantifiers - yield return new object[] { null, @"a*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*", "a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*?", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*?", "a", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a*?", "aa", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"a+?", "aa", 
RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } }; - yield return new object[] { null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } }; - yield return new object[] { null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } }; - yield return new object[] { null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; - yield return new object[] { null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } }; - yield return new object[] { null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } }; - yield return new object[] { null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } }; - yield return new object[] { null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] 
{ "bdefcdefxyz" } }; - yield return new object[] { null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; - foreach (string prefix in new[] { "", "xyz" }) - { - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; - yield return new object[] { null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; - } - yield return new object[] { null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } }; - yield return new object[] { null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } }; - yield return new object[] { null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } }; - yield return new object[] { null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } }; - yield return new object[] { null, @"(cat){cat}", "cat{cat}", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; - yield return new object[] { null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; - yield return new object[] { null, 
@"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; - yield return new object[] { null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; - yield return new object[] { null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } }; - yield return new object[] { null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; - yield return new object[] { null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; - yield return new object[] { null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; - yield return new object[] { null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; - - // Atomic subexpressions - // Implicitly upgrading (or not) oneloop to be atomic - yield return new object[] { null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; - yield return new object[] { null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, 
@"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } }; - yield return new object[] { null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade - yield return new object[] { null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; - yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; - yield return new object[] { null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; - yield return new object[] { null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; - yield return new object[] { null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; - yield return new object[] { null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; - yield return new object[] { null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } }; - yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } }; - yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } }; - yield return new object[] { null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { 
"abbabab" } }; - yield return new object[] { null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } }; - // Implicitly upgrading (or not) notoneloop to be atomic - yield return new object[] { null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } }; - yield return new object[] { null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } }; - yield return new object[] { null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade - yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } }; - yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", RegexOptions.IgnoreCase, new string[] { "BAababa" } }; - // Implicitly upgrading (or not) setloop to be atomic - yield return new object[] { null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { 
"aaab" } }; - yield return new object[] { null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; - yield return new object[] { null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; - yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; - yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; - yield return new object[] { null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; - yield return new object[] { null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } }; - yield return new object[] { null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[hij]*)ih", 
"efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade - yield return new object[] { null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } }; - yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } }; - yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } }; - yield return new object[] { null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; - yield return new object[] { null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; - yield return new object[] { null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } }; - // Implicitly upgrading (or not) concat loops to be atomic - yield return new object[] { null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } }; - yield return new object[] { null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } }; - // Implicitly upgrading (or not) nested loops to be atomic - yield return new object[] { null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; - yield return new object[] { null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; - yield return new object[] { null, @"(?:a{2}?){3}?", "aaaaaaaaa", 
RegexOptions.None, new string[] { "aaaaaa" } }; - yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } }; - yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } }; - yield return new object[] { null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } }; - // Nested atomic - yield return new object[] { null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } }; - yield return new object[] { null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } }; - - // Anchoring loops beginning with .* / .+ - yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } }; - yield return new object[] { null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } }; - yield return new object[] { null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } }; - yield return new object[] { null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } }; - yield return new object[] { null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123" - yield return new object[] { null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } }; - yield 
return new object[] { null, @".*", "\n", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } }; - yield return new object[] { null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".*", "abc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } }; - yield return new object[] { null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc" - yield return new object[] { null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } }; - yield return new object[] { null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } }; - yield return new object[] { null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } }; - yield return new object[] { null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; - yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; - yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; - yield return new object[] { null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; - yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } }; - yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } }; - yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } }; 
- yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } }; - yield return new object[] { null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } }; - yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } }; - yield return new object[] { null, @".+", "a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } }; - yield return new object[] { null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } }; - yield return new object[] { null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } }; - yield return new object[] { null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } }; - yield return new object[] { null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; - yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; - yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; - yield return new object[] { null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; - - // Unanchored .* - yield return new object[] { null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } }; - yield return new object[] { null, @"\A\s*(?\w+)(\s*\((?.*)\))?\s*\Z", "Match(Na\nme)", 
RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } }; - foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) - { - yield return new object[] { null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } }; - yield return new object[] { null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } }; - yield return new object[] { null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } }; - yield return new object[] { null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } }; - yield return new object[] { null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } }; - yield return new object[] { null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } }; - yield return new object[] { null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } }; - yield return new object[] { null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } }; - } + yield return new object[] { engine, null, @"\G<%#(?.*?)?%>", @"<%# DataBinder.Eval(this, ""MyNumber"") %>", RegexOptions.Singleline, new string[] { @"<%# DataBinder.Eval(this, ""MyNumber"") %>", @" DataBinder.Eval(this, ""MyNumber"") " } }; + + // Nested Quantifiers + yield return new object[] { engine, null, @"^[abcd]{0,0x10}*$", "a{0,0x10}}}", RegexOptions.None, new string[] { "a{0,0x10}}}" } }; + + // Lazy operator Backtracking + yield return new object[] { engine, null, @"http://([a-zA-z0-9\-]*\.?)*?(:[0-9]*)??/", "http://www.msn.com/", RegexOptions.IgnoreCase, new string[] { "http://www.msn.com/", "com", string.Empty } }; + yield return new object[] { engine, null, @"http://([a-zA-Z0-9\-]*\.?)*?/", @"http://www.google.com/", RegexOptions.IgnoreCase, new string[] { "http://www.google.com/", "com" } }; + + yield return new object[] { engine, null, @"([a-z]*?)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "c", string.Empty, "c" } }; + yield return new object[] { 
engine, null, @"^([a-z]*?)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + + // Backtracking + yield return new object[] { engine, null, @"([a-z]*)([\w])", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + yield return new object[] { engine, null, @"^([a-z]*)([\w])$", "cat", RegexOptions.IgnoreCase, new string[] { "cat", "ca", "t" } }; + + // Backtracking with multiple (.*) groups -- important ASP.NET scenario + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/.aspx", RegexOptions.None, new string[] { "/.aspx", string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Backtracking with multiple (.+) groups + yield return new object[] { engine, null, @"(.+)/(.+).aspx", 
"pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Backtracking with (.+) group followed by (.*) + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/.aspx", RegexOptions.None, new string[] { "pages/.aspx", "pages", string.Empty } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.+)/(.*)/(.*).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", 
"/pages", "homepage.aspx", "index" } }; + + // Backtracking with (.*) group followed by (.+) + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/homepage.aspx", RegexOptions.None, new string[] { "/homepage.aspx", string.Empty, "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "pages/homepage.aspx", RegexOptions.None, new string[] { "pages/homepage.aspx", "pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx", "/pages", "homepage" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage/index.aspx", RegexOptions.None, new string[] { "/pages/homepage/index.aspx", "/pages/homepage", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; + yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + + // Quantifiers + yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @"a*", "aa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @"a*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*?", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*?", "a", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a*?", "aa", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"a+?", "aa", RegexOptions.None, new 
string[] { "a" } }; + yield return new object[] { engine, null, @"a{1,", "a{1,", RegexOptions.None, new string[] { "a{1," } }; + yield return new object[] { engine, null, @"a{1,3}", "aaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a{1,3}?", "aaaaa", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @"a{2,2}", "aaaaa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @"a{2,2}?", "aaaaa", RegexOptions.None, new string[] { "aa" } }; + yield return new object[] { engine, null, @".{1,3}", "bb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @".{1,3}?", "bb\nba", RegexOptions.None, new string[] { "b" } }; + yield return new object[] { engine, null, @".{2,2}", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @".{2,2}?", "bbb\nba", RegexOptions.None, new string[] { "bb" } }; + yield return new object[] { engine, null, @"[abc]{1,3}", "ccaba", RegexOptions.None, new string[] { "cca" } }; + yield return new object[] { engine, null, @"[abc]{1,3}?", "ccaba", RegexOptions.None, new string[] { "c" } }; + yield return new object[] { engine, null, @"[abc]{2,2}", "ccaba", RegexOptions.None, new string[] { "cc" } }; + yield return new object[] { engine, null, @"[abc]{2,2}?", "ccaba", RegexOptions.None, new string[] { "cc" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "cdefxyz", RegexOptions.None, new string[] { "cdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){1,3}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "adefbdefcdefxyz" } }; 
+ yield return new object[] { engine, null, @"(?:[abc]def){2,2}xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; + yield return new object[] { engine, null, @"(?:[abc]def){2,2}?xyz", "adefbdefcdefxyz", RegexOptions.None, new string[] { "bdefcdefxyz" } }; + foreach (string prefix in new[] { "", "xyz" }) + { + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadefbdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){1,3}?", prefix + "cdefadefbdef", RegexOptions.None, new string[] { prefix + "cdef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; + yield return new object[] { engine, null, prefix + @"(?:[abc]def){2,2}?", prefix + "cdefadefbdefadef", RegexOptions.None, new string[] { prefix + "cdefadef" } }; + } + yield return new object[] { engine, null, @"(cat){", "cat{", RegexOptions.None, new string[] { "cat{", "cat" } }; + yield return new object[] { engine, null, @"(cat){}", "cat{}", RegexOptions.None, new string[] { "cat{}", "cat" } }; + yield return new object[] { engine, null, @"(cat){,", "cat{,", RegexOptions.None, new string[] { "cat{,", "cat" } }; + yield return new object[] { engine, null, @"(cat){,}", "cat{,}", RegexOptions.None, new string[] { "cat{,}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat}", "cat{cat}", 
RegexOptions.None, new string[] { "cat{cat}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,5}", "cat{cat,5}", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; + yield return new object[] { engine, null, @"(cat){5,dog}", "cat{5,dog}", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,dog}", "cat{cat,dog}", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){,}?", "cat{,}?", RegexOptions.None, new string[] { "cat{,}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat}?", "cat{cat}?", RegexOptions.None, new string[] { "cat{cat}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,5}?", "cat{cat,5}?", RegexOptions.None, new string[] { "cat{cat,5}", "cat" } }; + yield return new object[] { engine, null, @"(cat){5,dog}?", "cat{5,dog}?", RegexOptions.None, new string[] { "cat{5,dog}", "cat" } }; + yield return new object[] { engine, null, @"(cat){cat,dog}?", "cat{cat,dog}?", RegexOptions.None, new string[] { "cat{cat,dog}", "cat" } }; + + // Atomic subexpressions + // Implicitly upgrading (or not) oneloop to be atomic + yield return new object[] { engine, null, @"a*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new 
object[] { engine, null, @"a*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; + yield return new object[] { engine, null, @"a*[bcd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*(?>[bcd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*[bcd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"a*([bcd]ab|[bef]cd){1,3}", "aaababecdcac", RegexOptions.ExplicitCapture, new string[] { "aaababecd" } }; + yield return new object[] { engine, null, @"a*([bcd]|[aef]){1,3}", "befb", RegexOptions.ExplicitCapture, new string[] { "bef" } }; // can't upgrade + yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*$", "aaa", RegexOptions.Multiline, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"a*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"@*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"(?:abcd*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; + yield return new object[] { engine, null, @"(?:abcd|efgh*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; + yield return new object[] { engine, 
null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } }; + yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } }; + yield return new object[] { engine, null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } }; + yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { "abbabab" } }; + yield return new object[] { engine, null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } }; + // Implicitly upgrading (or not) notoneloop to be atomic + yield return new object[] { engine, null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[^b]*bac", "aaabac", RegexOptions.None, new string[] { "aaabac" } }; + yield return new object[] { engine, null, @"[^b]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } }; + yield return new object[] { engine, null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade + yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } }; + yield return new object[] { engine, null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", 
RegexOptions.IgnoreCase, new string[] { "BAababa" } }; + // Implicitly upgrading (or not) setloop to be atomic + yield return new object[] { engine, null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*b+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>b+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[^a]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>[^a]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*bcd", "aaabcd", RegexOptions.None, new string[] { "aaabcd" } }; + yield return new object[] { engine, null, @"[ac]*[bd]", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]+", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]+?", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*(?>[bd]+)", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*[bd]{1,3}", "aaab", RegexOptions.None, new string[] { "aaab" } }; + yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*$", "aaa", RegexOptions.Multiline, 
new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[ac]*\b", "aaa bbb", RegexOptions.ECMAScript, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.None, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"[@']*\B", "@@@", RegexOptions.ECMAScript, new string[] { "@@@" } }; + yield return new object[] { engine, null, @".*.", "@@@", RegexOptions.Singleline, new string[] { "@@@" } }; + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)ih", "efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade + yield return new object[] { engine, null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } }; + yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } }; + yield return new object[] { engine, null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } }; + yield return new object[] { engine, null, @"a[^wz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; + yield return new object[] { engine, null, @"a[^wyz]*w", "abcdcdcdwz", RegexOptions.None, new string[] { "abcdcdcdw" } }; + yield return new object[] { engine, null, @"a[^wyz]*W", "abcdcdcdWz", RegexOptions.IgnoreCase, new string[] { "abcdcdcdW" } }; + // Implicitly upgrading (or not) concat loops to be atomic + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } }; + yield return new object[] { engine, null, 
@"(?:[ab]c[de]f)*", "acdfbcef", RegexOptions.None, new string[] { "acdfbcef" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "cdfbcef", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)+", "cdfbcef", RegexOptions.None, new string[] { "bcef" } }; + yield return new object[] { engine, null, @"(?:[ab]c[de]f)*", "bcefbcdfacfe", RegexOptions.None, new string[] { "bcefbcdf" } }; + // Implicitly upgrading (or not) nested loops to be atomic + yield return new object[] { engine, null, @"(?:a){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:a){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaa" } }; + yield return new object[] { engine, null, @"(?:a{2}){3}", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; + yield return new object[] { engine, null, @"(?:a{2}?){3}?", "aaaaaaaaa", RegexOptions.None, new string[] { "aaaaaa" } }; + yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } }; + yield return new object[] { engine, null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } }; + yield return new object[] { engine, null, @"CN=(.*[^,]+).*", "CN=localhost", RegexOptions.Singleline, new string[] { "CN=localhost", "localhost" } }; + // Nested atomic + yield return new object[] { engine, null, @"(?>abc[def]gh(i*))", "123abceghiii456", RegexOptions.None, new string[] { "abceghiii", "iii" } }; + yield return new object[] { engine, null, @"(?>(?:abc)*)", "abcabcabc", RegexOptions.None, new string[] { "abcabcabc" } }; + + // Anchoring loops beginning with .* / .+ + yield return new object[] { engine, null, @".*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*", 
"\n\n\n\n", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } }; + yield return new object[] { engine, null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } }; + yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } }; + yield return new object[] { engine, null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } }; + yield return new object[] { engine, null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" }, "abc" }; // <-- Nonbacktracking match same as for "abc|.*123" + yield return new object[] { engine, null, @"abc|.*123", "abc\n123", RegexOptions.Singleline, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*", "\n", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".*", "abc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } }; + yield return new object[] { engine, null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" }, "abcghi" }; // <-- Nonbacktracking match same as for ".*ghi|.*abc" + yield return new 
object[] { engine, null, @".*ghi|.*abc", "abcghi", RegexOptions.None, new string[] { "abcghi" } }; + yield return new object[] { engine, null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } }; + yield return new object[] { engine, null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } }; + yield return new object[] { engine, null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; + yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; + yield return new object[] { engine, null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; + yield return new object[] { engine, null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; + yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } }; + yield return new object[] { engine, null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } }; + yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } }; + yield return new object[] { engine, null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } }; + yield return new object[] { engine, null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } }; + yield return new object[] { engine, null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] {
"456abc" } }; + yield return new object[] { engine, null, @".+", "a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } }; + yield return new object[] { engine, null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } }; + yield return new object[] { engine, null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } }; + yield return new object[] { engine, null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } }; + yield return new object[] { engine, null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } }; + yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } }; + yield return new object[] { engine, null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } }; + yield return new object[] { engine, null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } }; + + // Unanchored .* + yield return new object[] { engine, null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Name)", RegexOptions.None, new string[] { "Match(Name)", "(Name)", "Match", "Name" } }; + yield return new object[] { engine, null, @"\A\s*(?<name>\w+)(\s*\((?<arguments>.*)\))?\s*\Z", "Match(Na\nme)", RegexOptions.Singleline, new string[] { "Match(Na\nme)", "(Na\nme)", "Match", "Na\nme" } }; + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) + { + yield return new object[] { engine, null, @"abcd.*", @"abcabcd", options, new string[] { "abcd" } }; + yield return new object[] { engine, null, @"abcd.*", @"abcabcde", options, new string[] { "abcde" } }; + yield return new object[] { engine, null, @"abcd.*", @"abcabcdefg", options, new string[] { "abcdefg" } }; + yield return new object[] { engine, null, @"abcd(.*)", @"ababcd", options, new string[] { "abcd", "" } }; + yield return new object[] { engine,
null, @"abcd(.*)", @"aabcde", options, new string[] { "abcde", "e" } }; + yield return new object[] { engine, null, @"abcd(.*)", @"abcabcdefg", options, new string[] { "abcdefg", "efg" } }; + yield return new object[] { engine, null, @"abcd(.*)e", @"abcabcdefg", options, new string[] { "abcde", "" } }; + yield return new object[] { engine, null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } }; + } - // Grouping Constructs Invalid Regular Expressions - yield return new object[] { null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; - yield return new object[] { null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } }; - yield return new object[] { null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } }; - yield return new object[] { null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } }; - - // Alternation construct Invalid Regular Expressions - yield return new object[] { null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } }; - yield return new object[] { null, @"(?(cat)catdog|)", "dog", 
RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } }; - - yield return new object[] { null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } }; - yield return new object[] { null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } }; - - // Invalid unicode - yield return new object[] { null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } }; - yield return new object[] { null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; - - yield return new object[] { null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } }; - yield return new object[] { null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } }; - - // Empty Match - yield return new object[] { null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } }; - yield return new object[] { null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } }; + // Grouping Constructs Invalid Regular Expressions + yield return new object[] { engine, null, @"()", "cat", 
RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } }; + yield return new object[] { engine, null, @"(?:)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?imn)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?imn)cat", "(?imn)cat", RegexOptions.None, new string[] { "cat" } }; + yield return new object[] { engine, null, @"(?=)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } }; + yield return new object[] { engine, null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } }; + + // Alternation construct Invalid Regular Expressions + yield return new object[] { engine, null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } }; + yield return new object[] { engine, null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } }; + + yield return new object[] { engine, null, @"(?(cat)|catdog)", "cat", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null,
@"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } }; + yield return new object[] { engine, null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } }; + + // Invalid unicode + yield return new object[] { engine, null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]]+", "bcxyzABCXYZ123890a", RegexOptions.None, new string[] { "a" } }; + yield return new object[] { engine, null, "[\u0000-\uFFFF-[\\p{P}\\p{S}\\p{C}]]+", "!@`';.,$+<>=\x0001\x001FazAZ09", RegexOptions.None, new string[] { "azAZ09" } }; + + yield return new object[] { engine, null, @"[\uFFFD-\uFFFF]+", "\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFD\uFFFE\uFFFF" } }; + yield return new object[] { engine, null, @"[\uFFFC-\uFFFE]+", "\uFFFB\uFFFC\uFFFD\uFFFE\uFFFF", RegexOptions.IgnoreCase, new string[] { "\uFFFC\uFFFD\uFFFE" } }; + + // Empty Match + yield return new object[] { engine, null, @"([a*]*)+?$", "ab", RegexOptions.None, new string[] { "", "" } }; + yield return new object[] { engine, null, @"(a*)+?$", "b", RegexOptions.None, new string[] { "", "" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_enUS() { - yield return new object[] { "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "en-US", "aA", "Aa", 
RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "en-US", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "en-US", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "en-US", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "en-US", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "en-US", "\u0130", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "en-US", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Czech() { - yield return new object[] { "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; - yield return new object[] { "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "cs-CZ", "CH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + yield return new object[] { engine, "cs-CZ", "cH", "Ch", RegexOptions.IgnoreCase, new string[] { "Ch" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Danish() { - yield return new object[] { "da-DK", "AA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; - yield return new object[] { "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "da-DK", "AA", "Aa", 
RegexOptions.IgnoreCase, new string[] { "Aa" } }; + yield return new object[] { engine, "da-DK", "aA", "Aa", RegexOptions.IgnoreCase, new string[] { "Aa" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_Turkish() { - yield return new object[] { "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "tr-TR", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "tr-TR", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } public static IEnumerable Groups_CustomCulture_TestData_AzeriLatin() { - if (PlatformDetection.IsNotBrowser) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; - yield return new object[] { "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + if (PlatformDetection.IsNotBrowser) + { + yield return new object[] { engine, "az-Latn-AZ", "\u0131", "\u0049", RegexOptions.IgnoreCase, new string[] { "\u0049" } }; + yield return new object[] { engine, "az-Latn-AZ", "\u0130", "\u0069", RegexOptions.IgnoreCase, new string[] { "\u0069" } }; + } } } @@ -909,7 +927,7 @@ public static IEnumerable Groups_CustomCulture_TestData_AzeriLatin() [MemberData(nameof(Groups_CustomCulture_TestData_AzeriLatin))] [ActiveIssue("https://github.com/dotnet/runtime/issues/56407", TestPlatforms.Android)] [ActiveIssue("https://github.com/dotnet/runtime/issues/36900", TestPlatforms.iOS | TestPlatforms.tvOS | TestPlatforms.MacCatalyst)] - public async Task Groups(string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch 
= null) + public async Task Groups(RegexEngine engine, string cultureName, string pattern, string input, RegexOptions options, string[] expectedGroups, string altMatch = null) { if (cultureName is null) { @@ -917,59 +935,50 @@ public async Task Groups(string cultureName, string pattern, string input, Regex cultureName = culture.Equals(CultureInfo.InvariantCulture) ? "en-US" : culture.Name; } - using (new ThreadCultureChange(cultureName)) - { - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - { - // Alternative altMatch when order of alternations matters in backtracking but order does not matter in NonBacktracking mode - // Also in NonBacktracking there is only a single top-level match, which is expectedGroups[0] when altMatch is null - string[] expected = engine == RegexEngine.NonBacktracking ? - new string[] { altMatch ?? expectedGroups[0] } : - expectedGroups; + // Alternative altMatch when order of alternations matters in backtracking but order does not matter in NonBacktracking mode + // Also in NonBacktracking there is only a single top-level match, which is expectedGroups[0] when altMatch is null + expectedGroups = engine == RegexEngine.NonBacktracking ? + new string[] { altMatch ?? 
expectedGroups[0] } : + expectedGroups; - await GroupsAsync(engine, pattern, input, options, expected); - } + if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)")) + { + // General if-then-else construct is not supported and uses the ?(cat) condition in the tests + // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete + return; } - static async Task GroupsAsync(RegexEngine engine, string pattern, string input, RegexOptions options, string[] expectedGroups) + using var _ = new ThreadCultureChange(cultureName); + + Regex regex; + try { - if (engine == RegexEngine.NonBacktracking && pattern.Contains("?(cat)")) - { - // General if-then-else construct is not supported and uses the ?(cat) condition in the tests - // TODO-NONBACKTRACKING: The constructor will throw NotSupportedException so this check will become obsolete - return; - } + regex = await RegexHelpers.GetRegexAsync(engine, pattern, options); + } + catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine)) + { + // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences + return; + } - Regex regex; - try - { - regex = await RegexHelpers.GetRegexAsync(engine, pattern, options); - } - catch (NotSupportedException) when (RegexHelpers.IsNonBacktracking(engine)) - { - // Some constructs are not supported in NonBacktracking mode, such as: if-then-else, lookaround, and backreferences - return; - } + Match match = regex.Match(input); - Match match = regex.Match(input); + Assert.True(match.Success); + Assert.Equal(expectedGroups[0], match.Value); - Assert.True(match.Success); - Assert.Equal(expectedGroups[0], match.Value); + if (!RegexHelpers.IsNonBacktracking(engine)) + { + Assert.Equal(expectedGroups.Length, match.Groups.Count); - if (!RegexHelpers.IsNonBacktracking(engine)) + int[] groupNumbers = regex.GetGroupNumbers(); + string[] groupNames = regex.GetGroupNames(); + 
for (int i = 0; i < expectedGroups.Length; i++) { - Assert.Equal(expectedGroups.Length, match.Groups.Count); - - int[] groupNumbers = regex.GetGroupNumbers(); - string[] groupNames = regex.GetGroupNames(); - for (int i = 0; i < expectedGroups.Length; i++) - { - Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value); - Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]); - - Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i])); - Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i])); - } + Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value); + Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]); + + Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i])); + Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i])); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index e7f0b4d298553..dc9381f954f91 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -239,6 +239,8 @@ public static IEnumerable Match_MemberData() { yield return ("aaa(?i:match this)bbb", "aaaMaTcH ThIsbbb", RegexOptions.None, 0, 16, true, "aaaMaTcH ThIsbbb"); } + yield return ("(?i:a)b(?i:c)d", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "AbCd"); + yield return ("(?i:[\u0000-\u1000])[Bb]", "aaaaAbCdddd", RegexOptions.None, 0, 11, true, "Ab"); // Turning off case insensitive option in mid-pattern : Actual - "aaa(?-i:match this)bbb", "i" yield return ("aAa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb"); @@ -516,6 +518,8 @@ public static IEnumerable Match_MemberData() yield return (@".*\dFoo", "This1foo should 2FoO match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, 
"This1foo should 2FoO"); yield return (@".*\dFoo", "This1Foo should 2fOo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 26, true, "This1Foo should 2fOo"); yield return (@".*\dfoo", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 8, 4, true, "2FOO"); + yield return (@"[\w\s].*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "1fooThis2FOO should 1foo match"); + yield return (@"i.*", "1fooThis2FOO should 1foo match", RegexOptions.IgnoreCase | RegexOptions.RightToLeft, 0, 30, true, "is2FOO should 1foo match"); } // [ActiveIssue("https://github.com/dotnet/runtime/issues/36149")] @@ -537,6 +541,29 @@ public static IEnumerable Match_MemberData() // yield return (@"^(?i:[\u24B6-\u24D0])$", ((char)('\u24CF' + 26)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u24CF' + 26)).ToString()); //} + // Long inputs + string longCharacterRange = string.Concat(Enumerable.Range(1, 0x2000).Select(c => (char)c)); + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.IgnoreCase }) + { + yield return ("\u1000", longCharacterRange, options, 0, 0x2000, true, "\u1000"); + yield return ("[\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u1000"); + yield return ("[\u0FF0-\u0FFF][\u1000-\u1001]", longCharacterRange, options, 0, 0x2000, true, "\u0FFF\u1000"); + + yield return ("\uA640", longCharacterRange, options, 0, 0x2000, false, ""); + yield return ("[\u3000-\u3001]", longCharacterRange, options, 0, 0x2000, false, ""); + yield return ("[\uA640-\uA641][\u3000-\u3010]", longCharacterRange, options, 0, 0x2000, false, ""); + + if (!RegexHelpers.IsNonBacktracking(engine)) + { + yield return ("\u1000", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1000"); + yield return ("[\u1000-\u1001]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, true, "\u1001"); + yield 
return ("[\u1000][\u1001-\u1010]", longCharacterRange, options, 0, 0x2000, true, "\u1000\u1001"); + + yield return ("\uA640", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, ""); + yield return ("[\u3000-\u3001][\uA640-\uA641]", longCharacterRange, options | RegexOptions.RightToLeft, 0, 0x2000, false, ""); + } + } + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Singleline }) { yield return (@"\W.*?\D", "seq 012 of 3 digits", options, 0, 19, true, " 012 "); @@ -1283,13 +1310,11 @@ public void Match_ExcessPrefix(RegexEngine engine) // Repeaters Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{2147483647,}")).IsMatch("a")); - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in debug - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{51,}")).IsMatch("a")); + Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50,}")).IsMatch("a")); Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_000,}")).IsMatch("a")); // cutoff for Boyer-Moore prefix in release - Assert.False((await RegexHelpers.GetRegexAsync(engine, @"a{50_001,}")).IsMatch("a")); // Multis - foreach (int length in new[] { 50, 51, 50_000, 50_001, char.MaxValue + 1 }) // based on knowledge of cut-offs used in Boyer-Moore + foreach (int length in new[] { 50, 50_000, char.MaxValue + 1 }) { // The large counters are too slow for counting a's in NonBacktracking engine // They will incur a constant of size length because in .*a{k} after reading n a's the diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs index b325f1c0bf2c7..e1792c623be21 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Tests.Common.cs @@ -48,11 +48,11 @@ public static bool 
IsDefaultStart(string input, RegexOptions options, int start) return start == 0; } - public static Regex CreateRegexInCulture(string pattern, RegexOptions options, Globalization.CultureInfo culture) + public static async Task<Regex> GetRegexAsync(RegexEngine engine, string pattern, RegexOptions options, Globalization.CultureInfo culture) { using (new System.Tests.ThreadCultureChange(culture)) { - return new Regex(pattern, options); + return await GetRegexAsync(engine, pattern, options); } } @@ -116,7 +116,7 @@ public static async Task<Regex> GetRegexAsync(RegexEngine engine, string pattern // - Handle NonBacktrackingSourceGenerated return - options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) : + options is null ? new Regex(pattern, OptionsFromEngine(engine)) : matchTimeout is null ? new Regex(pattern, options.Value | OptionsFromEngine(engine)) : new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value); } @@ -136,7 +136,7 @@ public static async Task<Regex[]> GetRegexesAsync(RegexEngine engine, params (st { (string pattern, RegexOptions? options, TimeSpan? matchTimeout) = regexes[i]; results[i] = - options is null ? new Regex(pattern, RegexOptions.Compiled | OptionsFromEngine(engine)) : + options is null ? new Regex(pattern, OptionsFromEngine(engine)) : matchTimeout is null ?
new Regex(pattern, options.Value | OptionsFromEngine(engine)) : new Regex(pattern, options.Value | OptionsFromEngine(engine), matchTimeout.Value); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs index adcde90c42b97..028afabe61d9e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -12,62 +12,43 @@ namespace System.Text.RegularExpressions.Tests { public class RegexCultureTests { - // TODO: Validate source generator after figuring out what to do with culture - - public static IEnumerable RegexOptionsExtended() - { - yield return RegexOptions.None; - yield return RegexOptions.Compiled; - if (PlatformDetection.IsNetCore) - { - yield return RegexHelpers.RegexOptionNonBacktracking; - } - } - - public static IEnumerable RegexOptionsExtended_MemberData() => - from options in RegexOptionsExtended() - select new object[] { options }; - public static IEnumerable CharactersComparedOneByOne_AnchoredPattern_TestData() { - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { "^aa$", "aA", "da-DK", options, false }; - yield return new object[] { "^aA$", "aA", "da-DK", options, true }; - yield return new object[] { "^aa$", "aA", "da-DK", options | RegexOptions.IgnoreCase, true }; - yield return new object[] { "^aA$", "aA", "da-DK", options | RegexOptions.IgnoreCase, true }; + yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.None, false }; + yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.None, true }; + yield return new object[] { engine, "^aa$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; + yield return new object[] { engine, "^aA$", "aA", "da-DK", RegexOptions.IgnoreCase, true }; } } [Theory] 
[MemberData(nameof(CharactersComparedOneByOne_AnchoredPattern_TestData))] - public void CharactersComparedOneByOne_AnchoredPattern(string pattern, string input, string culture, RegexOptions options, bool expected) + public async Task CharactersComparedOneByOne_AnchoredPattern(RegexEngine engine, string pattern, string input, string culture, RegexOptions options, bool expected) { // Regex compares characters one by one. If that changes, it could impact the behavior of // a case like this, where these characters are not the same, but the strings compare // as equal with the invariant culture (and some other cultures as well). using (new ThreadCultureChange(culture)) { - foreach (RegexOptions compiled in new[] { RegexOptions.None, RegexOptions.Compiled }) - { - Assert.Equal(expected, new Regex(pattern, options | compiled).IsMatch(input)); - } + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, options); + Assert.Equal(expected, r.IsMatch(input)); } } - public static IEnumerable CharactersComparedOneByOne_Invariant_TestData() { - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - yield return new object[] { options }; - yield return new object[] { options | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }; + yield return new object[] { engine, RegexOptions.None }; + yield return new object[] { engine, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }; } } [Theory] [MemberData(nameof(CharactersComparedOneByOne_Invariant_TestData))] - public void CharactersComparedOneByOne_Invariant(RegexOptions options) + public async Task CharactersComparedOneByOne_Invariant(RegexEngine engine, RegexOptions options) { // Regex compares characters one by one. 
If that changes, it could impact the behavior of // a case like this, where these characters are not the same, but the strings compare @@ -88,35 +69,20 @@ public void CharactersComparedOneByOne_Invariant(RegexOptions options) string input = string.Concat(Enumerable.Repeat(S2, multiple)); Regex r; - // Validate when the string is at the beginning of the pattern, as it impacts Boyer-Moore prefix matching. - r = new Regex(pattern, options); + // Validate when the string is at the beginning of the pattern, as it impacts prefix matching. + r = await RegexHelpers.GetRegexAsync(engine, pattern, options); Assert.False(r.IsMatch(input)); Assert.True(r.IsMatch(pattern)); // Validate when it's not at the beginning of the pattern, as it impacts "multi" matching. - r = new Regex("[abc]" + pattern, options); + r = await RegexHelpers.GetRegexAsync(engine, "[abc]" + pattern, options); Assert.False(r.IsMatch("a" + input)); Assert.True(r.IsMatch("a" + pattern)); } } - public static IEnumerable CharactersLowercasedOneByOne_MemberData() - { - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - { - switch (engine) - { - case RegexEngine.SourceGenerated: - case RegexEngine.NonBacktrackingSourceGenerated: - continue; - } - - yield return new object[] { engine }; - } - } - [Theory] - [MemberData(nameof(CharactersLowercasedOneByOne_MemberData))] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] public async Task CharactersLowercasedOneByOne(RegexEngine engine) { using (new ThreadCultureChange("en-US")) @@ -191,15 +157,15 @@ Regex[] Create(string input, CultureInfo info, RegexOptions additional) [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")] [Fact] [ActiveIssue("https://github.com/dotnet/runtime/issues/60568", TestPlatforms.Android)] - public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking() + public async Task 
TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktracking() { var turkish = new CultureInfo("tr-TR"); string input = "I\u0131\u0130i"; // Use the input as the regex also // Ignore the Compiled option here because it is a noop in combination with NonBacktracking - Regex cultInvariantRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, CultureInfo.InvariantCulture); - Regex turkishRegex = RegexHelpers.CreateRegexInCulture(input, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.IgnoreCase, turkish); + Regex cultInvariantRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, CultureInfo.InvariantCulture); + Regex turkishRegex = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, input, RegexOptions.IgnoreCase, turkish); Assert.True(cultInvariantRegex.IsMatch(input)); Assert.True(turkishRegex.IsMatch(input)); // <---------- This result differs from the result in the previous test!!! 
@@ -220,60 +186,70 @@ public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture_NonBacktr Assert.True(turkishRegex.IsMatch(input.ToUpper(turkish))); } - [ActiveIssue("Incorrect handling of IgnoreCase over intervals in Turkish Culture, https://github.com/dotnet/runtime/issues/58958")] - [Fact] - public void TurkishCulture_Handling_Of_IgnoreCase() + [ActiveIssue("https://github.com/dotnet/runtime/issues/58958")] + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task TurkishCulture_Handling_Of_IgnoreCase(RegexEngine engine) { var turkish = new CultureInfo("tr-TR"); string input = "I\u0131\u0130i"; string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]"; - Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase, turkish); + Regex regex = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.IgnoreCase, turkish); // The pattern must trivially match the input because all of the letters fall in the given intervals // Ignoring case can only add more letters here -- not REMOVE letters Assert.True(regex.IsMatch(input)); } - [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Doesn't support NonBacktracking")] - [Fact] - public void TurkishCulture_Handling_Of_IgnoreCase_NonBacktracking() + public static IEnumerable TurkishCulture_MatchesWordChar_MemberData() { - var turkish = new CultureInfo("tr-TR"); - string input = "I\u0131\u0130i"; - string pattern = "[H-J][\u0131-\u0140][\u0120-\u0130][h-j]"; - - Regex regex = RegexHelpers.CreateRegexInCulture(pattern, RegexOptions.IgnoreCase | RegexHelpers.RegexOptionNonBacktracking, turkish); + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.None, "I\u0131\u0130i" }; + yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase, "I\u0131\u0130i" }; + if (!RegexHelpers.IsNonBacktracking(engine)) + { 
+ yield return new object[] { engine, "I\u0131\u0130i", RegexOptions.IgnoreCase | RegexOptions.ECMAScript, "" }; + } + } + } - // The pattern must trivially match the input because all of the letters fall in the given intervals - // Ignoring case can only add more letters here -- not REMOVE letters - Assert.True(regex.IsMatch(input)); + [Theory] + [MemberData(nameof(TurkishCulture_MatchesWordChar_MemberData))] + public async Task TurkishCulture_MatchesWordChar(RegexEngine engine, string input, RegexOptions options, string expectedResult) + { + using (new ThreadCultureChange(new CultureInfo("tr-TR"))) + { + Regex regex = await RegexHelpers.GetRegexAsync(engine, @"\w*", options); + Assert.Equal(expectedResult, regex.Match(input).Value); + } } public static IEnumerable Match_In_Different_Cultures_TestData() { CultureInfo invariant = CultureInfo.InvariantCulture; - CultureInfo current = CultureInfo.CurrentCulture; + CultureInfo enUS = new CultureInfo("en-US"); CultureInfo turkish = new CultureInfo("tr-TR"); - foreach (RegexOptions options in RegexOptionsExtended()) + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { // \u0130 (Turkish I with dot) and \u0131 (Turkish i without dot) are unrelated characters in general // Expected answers in the default en-US culture - yield return new object[] { "(?i:I)", options, current, "xy\u0131ab", "" }; - yield return new object[] { "(?i:iI+)", options, current, "abcIIIxyz", "III" }; - yield return new object[] { "(?i:iI+)", options, current, "abcIi\u0130xyz", "Ii\u0130" }; - yield return new object[] { "(?i:iI+)", options, current, "abcI\u0130ixyz", "I\u0130i" }; - yield return new object[] { "(?i:iI+)", options, current, "abc\u0130IIxyz", "\u0130II" }; - yield return new object[] { "(?i:iI+)", options, current, "abc\u0130\u0131Ixyz", "" }; - yield return new object[] { "(?i:iI+)", options, current, "abc\u0130Iixyz", "\u0130Ii" }; - yield return new object[] { "(?i:[^IJKLM]I)", options, current, "ii\u0130i\u0131ab", 
"" }; + yield return new object[] { "(?i:I)", engine, enUS, "xy\u0131ab", "" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcIIIxyz", "III" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcIi\u0130xyz", "Ii\u0130" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abcI\u0130ixyz", "I\u0130i" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130IIxyz", "\u0130II" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130\u0131Ixyz", "" }; + yield return new object[] { "(?i:iI+)", engine, enUS, "abc\u0130Iixyz", "\u0130Ii" }; + yield return new object[] { "(?i:[^IJKLM]I)", engine, enUS, "ii\u0130i\u0131ab", "" }; // Expected answers in the invariant culture - yield return new object[] { "(?i:I)", options, invariant, "xy\u0131ab", "" }; - yield return new object[] { "(?i:iI+)", options, invariant, "abcIIIxyz", "III" }; - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130\u0131Ixyz", "" }; + yield return new object[] { "(?i:I)", engine, invariant, "xy\u0131ab", "" }; + yield return new object[] { "(?i:iI+)", engine, invariant, "abcIIIxyz", "III" }; + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130\u0131Ixyz", "" }; // Expected answers in the Turkish culture // @@ -281,17 +257,17 @@ public static IEnumerable Match_In_Different_Cultures_TestData() // https://github.com/dotnet/runtime/issues/60568 if (!PlatformDetection.IsAndroid) { - yield return new object[] { "(?i:I)", options, turkish, "xy\u0131ab", "\u0131" }; - yield return new object[] { "(?i:iI+)", options, turkish, "abcIIIxyz", "" }; - yield return new object[] { "(?i:iI+)", options, turkish, "abcIi\u0130xyz", "" }; - yield return new object[] { "(?i:iI+)", options, turkish, "abcI\u0130ixyz", "" }; - yield return new object[] { "(?i:[^IJKLM]I)", options, turkish, "ii\u0130i\u0131ab", "i\u0131" }; + yield return new object[] { "(?i:I)", engine, turkish, "xy\u0131ab", "\u0131" }; + yield return new object[] { 
"(?i:iI+)", engine, turkish, "abcIIIxyz", "" }; + yield return new object[] { "(?i:iI+)", engine, turkish, "abcIi\u0130xyz", "" }; + yield return new object[] { "(?i:iI+)", engine, turkish, "abcI\u0130ixyz", "" }; + yield return new object[] { "(?i:[^IJKLM]I)", engine, turkish, "ii\u0130i\u0131ab", "i\u0131" }; } // None and Compiled are separated into the Match_In_Different_Cultures_CriticalCases test - if (options == RegexHelpers.RegexOptionNonBacktracking) + if (RegexHelpers.IsNonBacktracking(engine)) { - foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(options)) + foreach (object[] data in Match_In_Different_Cultures_CriticalCases_TestData_For(engine)) { yield return data; } @@ -299,39 +275,39 @@ public static IEnumerable Match_In_Different_Cultures_TestData() } } - public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions options) + public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine engine) { CultureInfo invariant = CultureInfo.InvariantCulture; CultureInfo turkish = new CultureInfo("tr-TR"); // Expected answers in the invariant culture - yield return new object[] { "(?i:iI+)", options, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abcI\u0130ixyz", "" }; // <-- failing for Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled - yield return new object[] { "(?i:iI+)", options, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled - yield return new object[] { "(?i:[^IJKLM]I)", options, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abcIi\u0130xyz", "Ii" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abcI\u0130ixyz", "" }; // <-- failing for 
Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130IIxyz", "II" }; // <-- failing for Compiled + yield return new object[] { "(?i:iI+)", engine, invariant, "abc\u0130Iixyz", "Ii" }; // <-- failing for Compiled + yield return new object[] { "(?i:[^IJKLM]I)", engine, invariant, "ii\u0130i\u0131ab", "\u0130i" }; // <-- failing for None, Compiled // Expected answers in the Turkish culture // Android produces unexpected results for tr-TR // https://github.com/dotnet/runtime/issues/60568 if (!PlatformDetection.IsAndroid) { - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled - yield return new object[] { "(?i:iI+)", options, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130IIxyz", "\u0130II" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130\u0131Ixyz", "\u0130\u0131I" }; // <-- failing for None, Compiled + yield return new object[] { "(?i:iI+)", engine, turkish, "abc\u0130Iixyz", "\u0130I" }; // <-- failing for None, Compiled } } public static IEnumerable Match_In_Different_Cultures_CriticalCases_TestData() => - Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.None).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexOptions.Compiled)); + Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Interpreter).Union(Match_In_Different_Cultures_CriticalCases_TestData_For(RegexEngine.Compiled)); [ActiveIssue("https://github.com/dotnet/runtime/issues/60899", TestPlatforms.Browser)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)] [Theory] [MemberData(nameof(Match_In_Different_Cultures_TestData))] 
- [ActiveIssue("https://github.com/dotnet/runtime/issues/60697", TestPlatforms.iOS | TestPlatforms.tvOS)] - public void Match_In_Different_Cultures(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected) + public async Task Match_In_Different_Cultures(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected) { - Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture); + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture); Match match = r.Match(input); Assert.Equal(match_expected, match.Value); } @@ -339,9 +315,9 @@ public void Match_In_Different_Cultures(string pattern, RegexOptions options, Cu [ActiveIssue("Incorrect treatment of IgnoreCase in Turkish and Invariant cultures, https://github.com/dotnet/runtime/issues/58956, https://github.com/dotnet/runtime/issues/58958 ")] [Theory] [MemberData(nameof(Match_In_Different_Cultures_CriticalCases_TestData))] - public void Match_In_Different_Cultures_CriticalCases(string pattern, RegexOptions options, CultureInfo culture, string input, string match_expected) + public async Task Match_In_Different_Cultures_CriticalCases(string pattern, RegexEngine engine, CultureInfo culture, string input, string match_expected) { - Regex r = RegexHelpers.CreateRegexInCulture(pattern, options, culture); + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, RegexOptions.None, culture); Match match = r.Match(input); Assert.Equal(match_expected, match.Value); } @@ -367,9 +343,8 @@ public void Match_InvariantCulture_None_vs_Compiled() ///
[OuterLoop("May take several seconds due to large number of cultures tested")] [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)] - [Theory] - [MemberData(nameof(RegexOptionsExtended_MemberData))] - public void TestIgnoreCaseRelation(RegexOptions options) + [Fact] + public void TestIgnoreCaseRelation() { // these 22 characters are considered case-insensitive by regex, while they are case-sensitive outside regex // but they are only case-sensitive in an asymmmetrical way: tolower(c)=c, tolower(toupper(c)) != c @@ -380,10 +355,10 @@ public void TestIgnoreCaseRelation(RegexOptions options) { char cU = char.ToUpper(c); Assert.NotEqual(c, cU); - Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), options | RegexOptions.IgnoreCase)); + Assert.False(Regex.IsMatch(c.ToString(), cU.ToString(), RegexOptions.IgnoreCase)); } - Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", options | RegexOptions.IgnoreCase)); + Assert.False(Regex.IsMatch(Turkish_i_withoutDot.ToString(), "i", RegexOptions.IgnoreCase)); // as baseline it is assumed the the invariant culture does not change HashSet[] inv_table = ComputeIgnoreCaseTable(CultureInfo.InvariantCulture, treatedAsCaseInsensitive); diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs index ee6339561b8bc..3f27aab3d89b8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs @@ -33,11 +33,14 @@ public class RegexExperiment /// Output directory for generated dgml files. private static string DgmlOutputDirectoryPath => Path.Combine(s_tmpWorkingDir, "dgml"); - private static string ExperimentDirectoryPath => Path.Combine(s_tmpWorkingDir, "experiments"); - - [ConditionalFact(nameof(Enabled))] + [Fact] public void RegenerateUnicodeTables() { + if (!Enabled) + { + return; + } + MethodInfo? 
genUnicode = typeof(Regex).GetMethod("GenerateUnicodeTables", BindingFlags.NonPublic | BindingFlags.Static); // GenerateUnicodeTables is not available in Release build if (genUnicode is not null) @@ -46,9 +49,6 @@ public void RegenerateUnicodeTables() } } - private static void WriteOutput(string message) => - File.AppendAllText(OutputFilePath, message); - /// Save the regex as a DFA in DGML format in the textwriter. private static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hideStateInfo = false, bool addDotStar = false, bool inReverse = false, bool onlyDFAinfo = false, int maxLabelLength = -1, bool asNFA = false) { @@ -85,60 +85,6 @@ internal static void ViewDGML(Regex regex, int bound = -1, bool hideStateInfo = } } - /// - /// The intent is that this method is run in realease build for lightweight performance testing. - /// One can e.g. open the outputfile in emacs with AUTO-REVERT-ON in order to follow the progress in real time. - /// It will print timing info and match info for both DFA, Compiled option and None. - /// Place sample regexes in the regexesfile (one per line) and sample input in inputfile. - /// It will essentially produce a csv file with the info: - /// regexnr, matchtime_DFA, result_DFA, matchtime_Compiled, result_Compiled, matchtime_None, result_None, - /// where result_.. 
is one of - /// Yes(index,length) - /// No - /// TIMEOUT - /// ERROR - /// and in the case of TIMEOUT or ERROR time is 10000 (the timeout limit of 10sec) - /// - [ConditionalFact(nameof(Enabled))] - public void TestRunPerformance() - { - if (!Directory.Exists(ExperimentDirectoryPath)) - { - Directory.CreateDirectory(ExperimentDirectoryPath); - } - - string[] dirs = Directory.GetDirectories(ExperimentDirectoryPath); - if (dirs.Length == 0) - { - WriteOutput("\nExperiments directory is empty"); - return; - } - - DirectoryInfo experimentDI = Directory.GetParent(dirs[0]); - DirectoryInfo[] experiments = - Array.FindAll(experimentDI.GetDirectories(), - di => ((di.Attributes & FileAttributes.Hidden) != (FileAttributes.Hidden)) && - Array.Exists(di.GetFiles(), f => f.Name.Equals("regexes.txt")) && - Array.Exists(di.GetFiles(), f => f.Name.Equals("input.txt"))); - if (experiments.Length == 0) - { - WriteOutput("\nExperiments directory has no indiviual experiment subdirectories containing files 'regexes.txt' and 'input.txt'."); - return; - } - - for (int i = 0; i < experiments.Length; i++) - { - string input = File.ReadAllText(Path.Combine(experiments[i].FullName, "input.txt")); - string[] rawRegexes = File.ReadAllLines(Path.Combine(experiments[i].FullName, "regexes.txt")); - - WriteOutput($"\n---------- {experiments[i].Name} ----------"); - for (int r = 0; r < rawRegexes.Length; r++) - { - TestRunRegex((r + 1).ToString(), rawRegexes[r], input); - } - } - } - private static long MeasureMatchTime(Regex re, string input, out Match match) { try @@ -178,9 +124,14 @@ private static string And(params string[] regexes) /// private static string Not(string regex) => $"(?({regex})[0-[0]]|.*)"; - [ConditionalFact(nameof(Enabled))] + [Fact] public void ViewSampleRegexInDGML() { + if (!Enabled) + { + return; + } + try { //string rawregex = @"\bis\w*\b"; @@ -233,45 +184,6 @@ static bool TrySaveDGML(Regex regex, TextWriter writer, int bound = -1, bool hid } } - private void 
TestRunRegex(string name, string rawregex, string input, bool viewDGML = false, bool dotStar = false) - { - var reNone = new Regex(rawregex, RegexOptions.None, new TimeSpan(0, 0, 10)); - var reCompiled = new Regex(rawregex, RegexOptions.Compiled, new TimeSpan(0, 0, 10)); - var reNonBacktracking = new Regex(rawregex, RegexOptions.NonBacktracking); - - if (viewDGML) - ViewDGML(reNonBacktracking, addDotStar: dotStar); - WriteOutput($"\n{name}"); - - // First call in each case is a warmup - - // None - MeasureMatchTime(reNone, input, out _); - long tN = MeasureMatchTime(reNone, input, out Match mN); - WriteMatchOutput(tN, mN); - - // Compiled - MeasureMatchTime(reCompiled, input, out _); - long tC = MeasureMatchTime(reCompiled, input, out Match mC); - WriteMatchOutput(tC, mC); - - // Non-Backtracking - MeasureMatchTime(reNonBacktracking, input, out _); - long tD = MeasureMatchTime(reNonBacktracking, input, out Match mD); - WriteMatchOutput(tD, mD); - - void WriteMatchOutput(long t, Match m) - { - WriteOutput(t switch - { - -1 => ",10000,TIMEOUT", - -2 => ",10000,ERROR", - _ when m.Success => $",{t},Yes({m.Index}:{m.Length})", - _ => $",{t},No" - }); - } - } - #region Tests involving Intersection and Complement // Currently only run in DEBUG mode in the NonBacktracking engine [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index ad5ca8d0754d9..492acbc5b7f86 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -363,6 +363,7 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")] [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")] 
[InlineData("bonjour|hej|ciao|shalom|zdravo|pozdrav|hallo|hola|hello|hey|witam|tere|bonjou|salam|helo|sawubona", "(?>bonjou(?>r|)|h(?>e(?>j|(?>l(?>lo|o)|y))|allo|ola)|ciao|s(?>halom|a(?>lam|wubona))|zdravo|pozdrav|witam|tere)")] + [InlineData("\\w\\d123|\\w\\dabc", "\\w\\d(?:123|abc)")] // Auto-atomicity [InlineData("a*b", "(?>a*)b")] [InlineData("a*b+", "(?>a*)b+")] @@ -384,6 +385,16 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?:w*)+\\.", "(?>w*)+\\.")] [InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")] [InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")] + // IgnoreCase set creation + [InlineData("(?i)abcd", "[Aa][Bb][Cc][Dd]")] + [InlineData("(?i)abcd|efgh", "[Aa][Bb][Cc][Dd]|[Ee][Ff][Gg][Hh]")] + [InlineData("(?i)a|b", "[AaBb]")] + [InlineData("(?i)[abcd]", "[AaBbCcDd]")] + [InlineData("(?i)[acexyz]", "[AaCcEeXxYyZz]")] + [InlineData("(?i)\\w", "\\w")] + [InlineData("(?i)\\d", "\\d")] + [InlineData("(?i).", ".")] + [InlineData("(?i)\\$", "\\$")] public void PatternsReduceIdentically(string pattern1, string pattern2) { string result1 = GetRegexCodes(new Regex(pattern1)); @@ -394,10 +405,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) } Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2))); - if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:")) - { - Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2))); - } } [Theory] From 61e3cef41b0f6819d3a183dabde3dc4fe5667bb8 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 12 Nov 2021 11:18:17 -0500 Subject: [PATCH 2/6] Fix missing condition in RegexCompiler --- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 2 +- .../System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index cfafa2656677d..ccca89e3a3666 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1584,7 +1584,7 @@ void GenerateFixedSet_LeftToRight() Ldloc(iLocal); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - if (setsToUse > 1) + if (setsToUse > 1 || primarySet.Distance != 0) { Ldc(minRequiredLength - 1); Sub(); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index dc9381f954f91..5da3f4fcaa331 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -276,6 +276,8 @@ public static IEnumerable Match_MemberData() yield return (@"\p{Ll}", "1bc", RegexOptions.IgnoreCase, 0, 3, true, "b"); yield return (@"\p{Lt}", "1bc", RegexOptions.IgnoreCase, 0, 3, true, "b"); yield return (@"\p{Lo}", "1bc", RegexOptions.IgnoreCase, 0, 3, false, string.Empty); + yield return (".[abc]", "xYZAbC", RegexOptions.IgnoreCase, 0, 6, true, "ZA"); + yield return (".[abc]", "xYzXyZx", RegexOptions.IgnoreCase, 0, 6, false, ""); // "\D+" yield return (@"\D+", "12321", RegexOptions.None, 0, 5, false, string.Empty); @@ -362,7 +364,6 @@ public static IEnumerable Match_MemberData() yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); } yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); - yield return (@"a\w*a|def", "aaaaa", RegexOptions.None, 0, 5, true, "aaaaa"); // No Negation yield return 
("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty); From 66795364d3ab94274a21b1ad40717616451e1a31 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sat, 13 Nov 2021 19:42:57 -0500 Subject: [PATCH 3/6] Try to fix mono failures and address comment feedback --- .../Text/RegularExpressions/RegexCharClass.cs | 6 +++--- .../Text/RegularExpressions/RegexCompiler.cs | 12 ++++++------ .../RegularExpressions/RegexFindOptimizations.cs | 14 ++------------ .../tests/RegexReductionTests.cs | 1 - 4 files changed, 11 insertions(+), 22 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index c60ea9b04de15..0b6dafbebb3d2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -876,8 +876,8 @@ public static bool ParticipatesInCaseConversion(int comparison) } } - /// Gets whether the specified string participates in case conversion. - /// The string participates in case conversion if any of its characters do. + /// Gets whether the specified span participates in case conversion. + /// The span participates in case conversion if any of its characters do. public static bool ParticipatesInCaseConversion(ReadOnlySpan s) { foreach (char c in s) @@ -1023,7 +1023,7 @@ public static bool IsWordChar(char ch) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool CharInClass(char ch, string set, ref uint[]? asciiLazyCache) { - // The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit in the pair + // The uint[] contains 8 ints, or 256 bits. 
These are laid out as pairs, where the first bit in the pair // says whether the second bit in the pair has already been computed. Once a value is computed, it's never // changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit // has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index ccca89e3a3666..fd2290b0d9b27 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1318,8 +1318,6 @@ void GenerateFixedSet_RightToLeft() using RentedLocalBuilder i = RentInt32Local(); - Mvfldloc(s_runtextField, _runtextLocal); - if (set.Chars is { Length: 1 } && !set.CaseInsensitive) { // int i = runtext.AsSpan(runtextpos, runtextbeg, runtextpos - runtextbeg).LastIndexOf(set.Chars[0]); @@ -1357,12 +1355,14 @@ void GenerateFixedSet_RightToLeft() Label increment = DefineLabel(); Label body = DefineLabel(); + Mvfldloc(s_runtextField, _runtextLocal); + // for (int i = runtextpos - 1; ... 
Ldloc(_runtextposLocal); Ldc(1); Sub(); Stloc(i); - Br(condition); + BrFar(condition); // if (MatchCharClass(runtext[i], set)) MarkLabel(body); @@ -1393,10 +1393,10 @@ void GenerateFixedSet_RightToLeft() MarkLabel(condition); Ldloc(i); Ldloc(_runtextbegLocal!); - Bge(body); - } + BgeFar(body); - BrFar(returnFalse); + BrFar(returnFalse); + } } void GenerateFixedSet_LeftToRight() diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index adab678bba7dd..f1b285818e93e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -113,12 +113,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) (true, false) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive, (true, true) => FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive, }; - - // Non-compiled code will be using CharInClass with a cache, so initialize that cache. - if (!compiled) - { - _asciiLookups = new uint[FixedDistanceSets.Count][]; - } + _asciiLookups = new uint[1][]; } } return; @@ -163,12 +158,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) (false, true) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive, (false, false) => FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive, }; - - // Non-compiled code will be using CharInClass with a cache, so initialize that cache. 
- if (!compiled) - { - _asciiLookups = new uint[FixedDistanceSets.Count][]; - } + _asciiLookups = new uint[fixedDistanceSets.Count][]; } return; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index 492acbc5b7f86..a5dd31a5252fe 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -450,7 +450,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) // Not reducing branches of alternations with different casing [InlineData("(?i:abcd)|abcd", "abcd|abcd")] [InlineData("abcd|(?i:abcd)", "abcd|abcd")] - [InlineData("abc(?:(?i:e)|f)", "abc[ef]")] // Not applying auto-atomicity [InlineData("a*b*", "(?>a*)b*")] [InlineData("[ab]*[^a]", "(?>[ab]*)[^a]")] From 502b5059c3644dbe77875905d6ae41645421b2ad Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 15 Nov 2021 16:08:31 -0500 Subject: [PATCH 4/6] Delete more now dead code --- .../Symbolic/SymbolicRegexMatcher.cs | 6 +- .../Symbolic/SymbolicRegexNode.cs | 146 ------------------ 2 files changed, 3 insertions(+), 149 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index fb8bd13e4bec8..21c023345c731 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -258,7 +258,7 @@ private DfaMatchingState Delta(string input, int i, DfaMa minterms.Length : // mintermId = minterms.Length represents \Z (last \n) _partitions.GetMintermID(c); - TSetType minterm = (uint)mintermId < minterms.Length ? 
+ TSetType minterm = (uint)mintermId < (uint)minterms.Length ? minterms[mintermId] : _builder._solver.False; // minterm=False represents \Z @@ -692,7 +692,7 @@ uint GetCharKindWithAnchor(string input, int i) { Debug.Assert(_asciiCharKinds is not null); - if ((uint)i >= input.Length) + if ((uint)i >= (uint)input.Length) { return CharKind.StartStop; } @@ -708,7 +708,7 @@ uint GetCharKindWithAnchor(string input, int i) uint[] asciiCharKinds = _asciiCharKinds; return - nextChar < asciiCharKinds.Length ? asciiCharKinds[nextChar] : + nextChar < (uint)asciiCharKinds.Length ? asciiCharKinds[nextChar] : _builder._solver.And(GetMinterm(nextChar), _builder._wordLetterPredicateForAnchors).Equals(_builder._solver.False) ? 0 : //apply the wordletter predicate to compute the kind of the next character CharKind.WordLetter; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 5ecadcad26ff6..f7992bf43950d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -1424,152 +1424,6 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1) }; } - /// - /// Gets the string prefix that the regex must match or the empty string if such a prefix does not exist. - /// Sets ignoreCase = true when the prefix works under case-insensitivity. 
- /// For example if the input prefix is "---" it sets ignoreCase=false, - /// if the prefix is "---[aA][bB]" it returns "---AB" and sets ignoreCase=true - /// - internal string GetFixedPrefix(CharSetSolver css, string culture, out bool ignoreCase) - { - ignoreCase = false; - StringBuilder prefix = new(); - bool doneWithoutIgnoreCase = false; - bool doneWithIgnoreCase = false; - foreach (S x in GetPrefixSequence()) - { - BDD bdd = _builder._solver.ConvertToCharSet(css, x); - char character = (char)bdd.GetMin(); - // Check if the prefix extends without ignore case: the set is a single character - if (!doneWithoutIgnoreCase && !css.IsSingleton(bdd)) - { - doneWithoutIgnoreCase = true; - } - if (!doneWithIgnoreCase) - { - // Check if the prefix extends with ignore case: ignoring case doesn't change the set - if (css.ApplyIgnoreCase(css.CharConstraint(character), culture).Equals(bdd)) - { - // Turn ignoreCase on when the prefix extends only under ignore case - if (doneWithoutIgnoreCase) - { - ignoreCase = true; - } - } - else - { - doneWithIgnoreCase = true; - } - } - // Append the character when the prefix extends in either of the ways - if (!doneWithoutIgnoreCase || !doneWithIgnoreCase) - prefix.Append(character); - else - break; - } - return prefix.ToString(); - } - - private IEnumerable GetPrefixSequence() - { - List> paths = new(); - HashSet> nextPaths = new(); - - paths.Add(this); - while (true) - { - bool done = false; - Debug.Assert(paths.Count > 0, "The generator should have ended when any path fails to extend."); - // Generate the next set from one path - S next; - if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out next)) - { - // A path didn't have a next set as supported by this algorithm - yield break; - } - if (!_builder._solver.IsSatisfiable(next)) - { - yield break; - } - while (paths.Count > 0) - { - // For all other paths check that they produce the same set - S newSet; - if (!GetNextPrefixSet(ref paths, ref nextPaths, ref done, out newSet) 
|| !newSet.Equals(next)) - { - // Either a path didn't have a next set as supported by this algorithm, or the next set was not equal - yield break; - } - } - // At this point all paths generated equal next sets - yield return next; - if (done) - { - // Some path had no continuation, end the prefix - yield break; - } - else - { - Debug.Assert(paths.Count == 0, "Not all paths were considered for next set."); - paths.AddRange(nextPaths); - nextPaths.Clear(); - } - } - } - - private bool GetNextPrefixSet(ref List> paths, ref HashSet> nextPaths, ref bool done, out S set) - { - while (paths.Count > 0) - { - SymbolicRegexNode node = paths[paths.Count - 1]; - paths.RemoveAt(paths.Count - 1); - switch (node._kind) - { - case SymbolicRegexKind.Singleton: - Debug.Assert(node._set is not null); - set = node._set; - done = true; // No continuation, done after the next set - return true; - case SymbolicRegexKind.Concat: - Debug.Assert(node._left is not null && node._right is not null); - if (!node._left.CanBeNullable) - { - if (node._left.GetFixedLength() == 1) - { - set = node._left.GetStartSet(); - // Left side had just one character, can use just right side as path - nextPaths.Add(node._right); - return true; - } - else - { - // Left side may need multiple steps to get through. However, it is safe - // (though not complete) to forget the right side and just expand the path - // for the left side. 
- paths.Add(node._left); - break; - } - } - else - { - // Left side may be nullable, can't extend the prefix - set = _builder._solver.False; // Not going to be used - return false; - } - case SymbolicRegexKind.Or: - case SymbolicRegexKind.And: - Debug.Assert(node._alts is not null); - // Handle alternatives as separate paths - paths.AddRange(node._alts); - break; - default: - set = _builder._solver.False; // Not going to be used - return false; // Cut prefix immediately for unhandled node - } - } - set = _builder._solver.False; // Not going to be used - return false; - } /// Get the predicate that covers all elements that make some progress. internal S GetStartSet() => _startSet; From 51fcda3368827dd995a52b7780c926c5d48ea854 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 16 Nov 2021 22:27:06 -0500 Subject: [PATCH 5/6] Fix dead code emitting after refactoring Previously return statements while emitting anchors were short-circuiting the rest of the emitting code, but when I moved that code into a helper, the returns stopped having that impact, such that we'd end up emitting a return statement and then emit dead code after it. Fix it. --- .../gen/RegexGenerator.Emitter.cs | 113 +++++++++--------- .../Text/RegularExpressions/RegexCompiler.cs | 25 ++-- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 835339bf8006e..66cd2b68feac7 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -265,62 +265,63 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, }; using (EmitBlock(writer, clause)) { - // Some anchors help to advance the position but don't terminate the operation. 
- // As such, we do the anchors check first, and then treat them below the same - // as if there's no special searching enabled. - EmitAnchors(); - - // If whatever search operation we need to perform entails case-insensitive operations - // that weren't already handled via creation of sets, we need to get an store the - // TextInfo object to use (unless RegexOptions.CultureInvariant was specified). - EmitTextInfo(writer, ref hasTextInfo, rm); - - // Emit the code for whatever find mode has been determined. - switch (code.FindOptimizations.FindMode) + // Emit any anchors. + if (!EmitAnchors()) { - case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); - break; + // Either anchors weren't specified, or they don't completely root all matches to a specific location. - case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix); - break; + // If whatever search operation we need to perform entails case-insensitive operations + // that weren't already handled via creation of sets, we need to get an store the + // TextInfo object to use (unless RegexOptions.CultureInvariant was specified). + EmitTextInfo(writer, ref hasTextInfo, rm); - case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: - case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: - case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: - case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: - Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - EmitFixedSet_LeftToRight(); - break; + // Emit the code for whatever find mode has been determined. 
+ switch (code.FindOptimizations.FindMode) + { + case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: - case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: - Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - EmitFixedSet_RightToLeft(); - break; + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_RightToLeft(code.FindOptimizations.LeadingCaseSensitivePrefix); + break; - // Already emitted earlier - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: - Debug.Assert(code.FindOptimizations.LeadingAnchor != 0); - goto case FindNextStartingPositionMode.NoSearch; + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_LeftToRight(); + break; - default: - Debug.Fail($"Unexpected mode: 
{code.FindOptimizations.FindMode}"); - goto case FindNextStartingPositionMode.NoSearch; + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_RightToLeft(); + break; - case FindNextStartingPositionMode.NoSearch: - writer.WriteLine("return true;"); - break; + // Already emitted earlier + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: + case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + Debug.Assert(code.FindOptimizations.LeadingAnchor != 0); + goto case FindNextStartingPositionMode.NoSearch; + + default: + Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}"); + goto case FindNextStartingPositionMode.NoSearch; + + case FindNextStartingPositionMode.NoSearch: + writer.WriteLine("return true;"); + break; + } } } writer.WriteLine(); @@ -330,7 +331,9 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, writer.WriteLine(!rm.Code.RightToLeft ? "base.runtextpos = runtextend;" : "base.runtextpos = runtextbeg;"); writer.WriteLine("return false;"); - void EmitAnchors() + // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further + // searching is required; otherwise, false. + bool EmitAnchors() { // Generate anchor checks. 
if ((code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) @@ -356,7 +359,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.Start: writer.WriteLine("// Start \\G anchor"); @@ -376,7 +379,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.EndZ: // TODO: Why are the LTR and RTL cases inconsistent here with RegexOptions.Compiled? @@ -397,7 +400,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check writer.WriteLine("// End \\z anchor"); @@ -416,7 +419,7 @@ void EmitAnchors() } } writer.WriteLine("return true;"); - return; + return true; case RegexPrefixAnalyzer.Bol: // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike @@ -438,6 +441,8 @@ void EmitAnchors() break; } } + + return false; } // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index fd2290b0d9b27..08f2f10a1e1c7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1012,10 +1012,13 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or Ret(); MarkLabel(finishedLengthCheck); - // Some anchors help to advance the position but don't terminate the operation. 
- // As such, we do the anchors check first, and then treat them below the same - // as if there's no special searching enabled. - GenerateAnchors(); + // Emit any anchors. + if (GenerateAnchors()) + { + return; + } + + // Either anchors weren't specified, or they don't completely root all matches to a specific location. switch (_code.FindOptimizations.FindMode) { @@ -1066,7 +1069,9 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or break; } - void GenerateAnchors() + // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further + // searching is required; otherwise, false. + bool GenerateAnchors() { // Generate anchor checks. if ((_code.FindOptimizations.LeadingAnchor & (RegexPrefixAnalyzer.Beginning | RegexPrefixAnalyzer.Start | RegexPrefixAnalyzer.EndZ | RegexPrefixAnalyzer.End | RegexPrefixAnalyzer.Bol)) != 0) @@ -1095,7 +1100,7 @@ void GenerateAnchors() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.Start: { @@ -1115,7 +1120,7 @@ void GenerateAnchors() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.EndZ: { @@ -1157,7 +1162,7 @@ void GenerateAnchors() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check { @@ -1180,7 +1185,7 @@ void GenerateAnchors() } Ldc(1); Ret(); - return; + return true; case RegexPrefixAnalyzer.Bol: { @@ -1241,6 +1246,8 @@ void GenerateAnchors() break; } } + + return false; } void GenerateIndexOf_LeftToRight(string prefix) From 6ce524fc0d1ed04b1a547cab483bae13b2d4cf06 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 16 Nov 2021 22:44:36 -0500 Subject: [PATCH 6/6] Remove some now dead code --- .../gen/RegexGenerator.Emitter.cs | 14 +------------- .../Text/RegularExpressions/RegexCompiler.cs | 14 +------------- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git 
a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 66cd2b68feac7..10579ce61118e 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -302,18 +302,6 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, EmitFixedSet_RightToLeft(); break; - // Already emitted earlier - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: - Debug.Assert(code.FindOptimizations.LeadingAnchor != 0); - goto case FindNextStartingPositionMode.NoSearch; - default: Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}"); goto case FindNextStartingPositionMode.NoSearch; @@ -402,7 +390,7 @@ bool EmitAnchors() writer.WriteLine("return true;"); return true; - case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check + case RegexPrefixAnalyzer.End: writer.WriteLine("// End \\z anchor"); if (!rtl) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 08f2f10a1e1c7..49b88d3954cab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1046,18 +1046,6 @@ FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive or GenerateFixedSet_RightToLeft(); break; - // Already emitted earlier - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: - Debug.Assert(_code.FindOptimizations.LeadingAnchor != 0); - goto case FindNextStartingPositionMode.NoSearch; - default: Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}"); goto case FindNextStartingPositionMode.NoSearch; @@ -1164,7 +1152,7 @@ bool GenerateAnchors() Ret(); return true; - case RegexPrefixAnalyzer.End when minRequiredLength == 0: // if it's > 0, we already output a more stringent check + case RegexPrefixAnalyzer.End: { Label l1 = DefineLabel(); Ldloc(_runtextposLocal);