Skip to content
Merged

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ private void MakeLoopAtomic()
[Conditional("DEBUG")]
private void ValidateFinalTreeInvariants()
{
Debug.Assert(Type == Capture, "Every generated tree should begin with a capture node");

var toExamine = new Stack<RegexNode>();
toExamine.Push(this);
while (toExamine.Count > 0)
Expand Down Expand Up @@ -306,8 +308,11 @@ private void ValidateFinalTreeInvariants()
break;

case Testref:
Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}");
break;

case Testgroup:
Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}.");
Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}");
break;

case Concatenate:
Expand Down Expand Up @@ -2229,9 +2234,18 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
case Empty:
case Nothing:
case UpdateBumpalong:
// Backreferences are supported
case Ref:
supported = true;
break;

// Conditional backreference tests are also supported, so long as both their yes/no branches are supported.
case Testref:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 1 || Child(1).SupportsSimplifiedCodeGenerationImplementation());
break;

// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
Expand All @@ -2244,17 +2258,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
supported = M == N || AncestorsAllowBacktracking(Next);
break;

// Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
// For greedy and lazy loops, they're supported if the node they wrap is supported
// and either the node is actually a repeater, is atomic, or is in the tree in a
// location where backtracking is allowed.
case Loop:
supported =
(M == N || (Next != null && Next.Type == Atomic)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// Similarly, as long as the wrapped node supports simplified code gen,
// Lazy is supported if it's a repeater or atomic, but also if it's in
// a place where backtracking is allowed (e.g. it's top-level).
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
Expand Down Expand Up @@ -2297,11 +2304,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
break;

case Capture:
// Currently we only support capnums without uncapnums (for balancing groups)
supported = N == -1;
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
if (supported)
{
// And we only support them in certain places in the tree.
// Captures are currently only supported in certain places in the tree.
RegexNode? parent = Next;
while (parent != null)
{
Expand All @@ -2322,25 +2328,31 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
}
}

// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
if (supported)
{
// And we only support them if their children are supported.
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();

// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
if (supported)
parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
}
}
break;

case Testgroup:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
Child(1).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 2 || Child(2).SupportsSimplifiedCodeGenerationImplementation());
break;

default:
Debug.Fail($"Unknown type: {Type}");
supported = false;
break;
}
}
#if DEBUG
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2071,6 +2071,17 @@ private bool IsCaptureSlot(int i)
return i >= 0 && i < _capsize;
}

/// <summary>
/// Translates a capture number into its index in the array of capture slots.
/// A regex that uses a sparse set of capture numbers hashes them to a dense set
/// of indices; that lookup is performed here, at compile time, so that it does
/// not have to be repeated at match time.
/// </summary>
/// <param name="capnum">The capture number to translate. The value -1 is returned unchanged.</param>
/// <param name="caps">The table mapping capture numbers to slot indices, or null when no mapping is in use.</param>
/// <returns>
/// <paramref name="capnum"/> itself when it is -1 or when <paramref name="caps"/> is null;
/// otherwise the value stored in <paramref name="caps"/> under <paramref name="capnum"/>.
/// </returns>
internal static int MapCaptureNumber(int capnum, Hashtable? caps)
{
    // Both pass-through cases yield the incoming number: -1 maps to -1, and
    // without a table every number maps to itself.
    if (capnum == -1 || caps == null)
    {
        return capnum;
    }

    return (int)caps[capnum]!;
}

/// <summary>Determines whether <paramref name="capname"/> is a key in the capture-name table.</summary>
/// <param name="capname">The capture name to look for.</param>
/// <returns>true when the capture-name table exists and contains <paramref name="capname"/>; otherwise, false.</returns>
private bool IsCaptureName(string capname)
{
    // With no table there is no name that could match.
    if (_capnames == null)
    {
        return false;
    }

    return _capnames.ContainsKey(capname);
}

Expand Down Expand Up @@ -2171,7 +2182,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement)
_concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture));
break;

case > 1 when !UseOptionI() || isReplacement:
case > 1 when !UseOptionI() || isReplacement || !RegexCharClass.ParticipatesInCaseConversion(_pattern.AsSpan(pos, cch)):
_concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
break;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,17 +214,6 @@ private int StringCode(string str)
return i;
}

/// <summary>
/// Translates a capture number into its index in the array of capture slots.
/// A regex with a sparse set of capture numbers hashes them to a dense set of
/// indices; the hash is resolved here, at compile time, instead of at match time.
/// </summary>
/// <param name="capnum">The capture number to translate. The value -1 is returned unchanged.</param>
/// <returns>
/// -1 when <paramref name="capnum"/> is -1; the entry stored for <paramref name="capnum"/>
/// in the mapping table when one exists; otherwise <paramref name="capnum"/> itself.
/// </returns>
private int MapCapnum(int capnum) => capnum switch
{
    -1 => -1,
    _ when _caps != null => (int)_caps[capnum]!,
    _ => capnum,
};

/// <summary>
/// The main RegexCode generator. It does a depth-first walk
/// through the tree and calls EmitFragment to emit code before
Expand Down Expand Up @@ -283,7 +272,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Setjump);
_intStack.Append(_emitted.Length);
Emit(RegexCode.Lazybranch, 0);
Emit(RegexCode.Testref, MapCapnum(node.M));
Emit(RegexCode.Testref, RegexParser.MapCaptureNumber(node.M, _caps));
Emit(RegexCode.Forejump);
break;
}
Expand Down Expand Up @@ -391,7 +380,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
break;

case RegexNode.Capture | AfterChild:
Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
Emit(RegexCode.Capturemark, RegexParser.MapCaptureNumber(node.M, _caps), RegexParser.MapCaptureNumber(node.N, _caps));
break;

case RegexNode.Require | BeforeChild:
Expand Down Expand Up @@ -471,7 +460,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
break;

case RegexNode.Ref:
Emit(node.Type | bits, MapCapnum(node.M));
Emit(node.Type | bits, RegexParser.MapCaptureNumber(node.M, _caps));
break;

case RegexNode.Nothing:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ public static IEnumerable<object[]> ValidateRegex_MemberData()

(string Pattern, RegexOptions Options, string Input, string Expected) testCase = allEngineCases[i];
yield return new object[] { engine, testCase.Pattern, testCase.Options, results[i], testCase.Input, expected };
yield return new object[] { engine, testCase.Pattern, testCase.Options | RegexOptions.CultureInvariant, results[i], testCase.Input, expected };
if ((testCase.Options & RegexOptions.IgnoreCase) != 0)
{
yield return new object[] { engine, testCase.Pattern, testCase.Options | RegexOptions.CultureInvariant, results[i], testCase.Input, expected };
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,15 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };

// Captures inside varying constructs with backtracking needing to uncapture
yield return new object[] { engine, null, @"a(bc)d|abc(e)", "abce", RegexOptions.None, new string[] { "abce", "", "e" } }; // alternation
yield return new object[] { engine, null, @"((ab){2}cd)*", "ababcdababcdababc", RegexOptions.None, new string[] { "ababcdababcd", "ababcd", "ab" } }; // loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "aba", RegexOptions.None, new string[] { "a", "", "" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "ababa", RegexOptions.None, new string[] { "aba", "ab", "a" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "abababa", RegexOptions.None, new string[] { "ababa", "ab", "a" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa..", RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead
yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa.3", RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead

// Quantifiers
yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,6 @@ public async Task Docs_GroupingConstructs_NonbacktrackingSubexpressions(RegexEng

Regex rBack = await RegexHelpers.GetRegexAsync(engine, @"(\w)\1+.\b");
Regex rNoBack = await RegexHelpers.GetRegexAsync(engine, @"(?>(\w)\1+).\b");
string[] inputs = { "aaad", "aaaa" };

Match back, noback;

back = rBack.Match("cccd.");
Expand Down Expand Up @@ -1117,6 +1115,95 @@ public async Task Docs_Anchors_ContiguousMatches(RegexEngine engine)
Regex.Replace(Input, Pattern, m => string.Concat(m.Value.Reverse())));
}

//
// Based on examples from https://blog.stevenlevithan.com/archives/balancing-groups
//

/// <summary>
/// Checks a balancing-group pattern against a list of inputs that must match
/// (palindromes) and a list that must not (non-palindromes).
/// </summary>
/// <param name="engine">The regex engine under test.</param>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_Palindromes(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    const string Pattern = @"(?<N>.)+.?(?<-N>\k<N>)+(?(N)(?!))";
    string[] palindromes = { "kayak", "racecar", "never odd or even", "madam im adam" };
    string[] nonPalindromes = { "canoe", "raceboat" };

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern);

    Assert.All(palindromes, text => Assert.True(regex.IsMatch(text)));
    Assert.All(nonPalindromes, text => Assert.False(regex.IsMatch(text)));
}

/// <summary>
/// Checks a balancing-group pattern that tracks a <c>Depth</c> group against inputs
/// with matched parentheses (expected to match) and unmatched ones (expected not to).
/// </summary>
/// <param name="engine">The regex engine under test.</param>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_MatchingParentheses(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    // The pattern text is reproduced exactly; it is compiled with IgnorePatternWhitespace.
    const string Pattern = @"^\(
(?>
[^()]+
|
\( (?<Depth>)
|
\) (?<-Depth>)
)*
(?(Depth)(?!))
\)$";

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern, RegexOptions.IgnorePatternWhitespace);

    string[] expectedMatches = { "()", "(a(b c(de(f(g)hijkl))mn))" };
    foreach (string input in expectedMatches)
    {
        Assert.True(regex.IsMatch(input));
    }

    string[] expectedNonMatches = { "(", ")", "())", "(()", "(ab(cd)ef" };
    foreach (string input in expectedNonMatches)
    {
        Assert.False(regex.IsMatch(input));
    }
}

/// <summary>
/// Checks a balancing-group pattern that shuffles captures between groups
/// <c>A</c>, <c>B</c> and <c>C</c> against one input expected to match and one expected not to.
/// </summary>
/// <param name="engine">The regex engine under test.</param>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_WordLengthIncreases(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    // The pattern text is reproduced exactly; it is compiled with IgnorePatternWhitespace.
    const string Pattern = @"^(?:
(?(A)\s|)
(?<B>)
(?<C-B>\w)+ (?(B)(?!))
(?:
\s
(?<C>)
(?<B-C>\w)+ (?(C)(?!))
(?<A>)
)?
)+ \b$";

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern, RegexOptions.IgnorePatternWhitespace);

    bool acceptsFirst = regex.IsMatch("a bc def ghij klmni");
    Assert.True(acceptsFirst);

    bool acceptsSecond = regex.IsMatch("a bc def ghi klmn");
    Assert.False(acceptsSecond);
}

//
// These patterns come from real-world customer usages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ internal static async Task<Regex[]> SourceGenRegexAsync(
if (generatorResults.Diagnostics.Length != 0)
{
throw new ArgumentException(
string.Join(Environment.NewLine, generatorResults.Diagnostics) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))));
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.Diagnostics));
}

// Compile the assembly to a stream
Expand All @@ -122,8 +122,8 @@ internal static async Task<Regex[]> SourceGenRegexAsync(
if (!results.Success || results.Diagnostics.Length != 0)
{
throw new ArgumentException(
string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics)) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))));
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine +
string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics)));
}
dll.Position = 0;

Expand Down