Skip to content

Commit

Permalink
Captures support for NonBacktracking (#65129)
Browse files Browse the repository at this point in the history
* Initial version of captures in NonBacktracking

* Working version of captures in NonBacktracking

* State/transition priorities for NonBacktracking

* Various fixes for capturing in NonBacktracking

Re-enable replacement patterns.
Fix eager derivative.
Fix capture numbering to work with sparsely numbered groups.

* Enable many subcapture tests for NonBacktracking

Also take any subcaptures out of RegexExperiment intersection and
negation tests, as capture semantics with these are not correct yet.

* Use new eager derivative even without subcaptures

This ensures that matches always have the correct length.

* Enable more subcapture tests for NonBacktracking

* Performance work for NonBacktracking captures

* Bug fixes and comments

* Fix for changes removing exclusive_end

* Rename back to _lower

* Fix beginning handling for captures

* Resurrect deleted test

* Remove debugging test

* Provide effects semantics for extended combinators

This also applies to the unordered Or, which gets the semantics that all alternatives
are visited.

* Apply suggestions from code review to SparseIntMap

Co-authored-by: Stephen Toub <stoub@microsoft.com>

* Apply suggestions from code review

Cleanup and volatile write

Co-authored-by: Stephen Toub <stoub@microsoft.com>

* Comments, fixes and cleanup

* Disable tests for extended features

Conjunction and complement are broken in the new capturing support.

* Apply suggestions from code review

Fixes for FindEndPositionCapturing

Co-authored-by: Stephen Toub <stoub@microsoft.com>

* Switch all phases to use eager derivative

This allows avoiding capture tracking in the third phase if there are
no subcaptures.
As a side effect of this change all derivatives produce OrderedOr nodes,
which for now effectively disables the subsumption optimization.

* Flatten ordered or and add subsumption

Previously the loop subsumption optimization only worked in
SymbolicRegexSet, but that is getting phased out with the order
maintaining derivative. This reimplements a version of that for ordered
ors.
Also do actual canonicalization of ordered ors as we should.

* Avoid some copies and overhead in capturing mode

* Improved comment and a better assert

* Move some per thread state into the runner

This avoids some repeated allocations in the capturing mode.

* Fix typo in SparseIntMap

* Various cleanup

* Fix a comment

* Avoid copying captures when doing quick match

Co-authored-by: Stephen Toub <stoub@microsoft.com>
  • Loading branch information
olsaarik and stephentoub authored Feb 12, 2022
1 parent bda8c91 commit d78094e
Show file tree
Hide file tree
Showing 26 changed files with 1,731 additions and 580 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@
<!-- RegexOptions.NonBacktracking -->
<Compile Include="System\Text\RegularExpressions\Symbolic\BooleanClassifier.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\CharKind.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\RegexNodeToSymbolicConverter.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SparseIntMap.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicMatch.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicNFA.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexBuilder.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,10 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C
// and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking.
_code = RegexWriter.Write(tree, culture);

if ((options & RegexOptions.NonBacktracking) != 0)
{
// NonBacktracking doesn't support captures (other than the implicit top-level capture).
capnames = null;
capslist = null;
caps = null;
capsize = 1;
}
else
{
capnames = tree.CapNames;
capslist = tree.CapsList;
caps = _code.Caps;
capsize = _code.CapSize;
}
capnames = tree.CapNames;
capslist = tree.CapsList;
caps = _code.Caps;
capsize = _code.CapSize;
}

internal static void ValidatePattern(string pattern)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -533,17 +533,6 @@ private RegexNode ScanReplacement()
if (RightCharMoveRight() == '$')
{
RegexNode node = ScanDollar();

// NonBacktracking does not support capture groups, so any replacement patterns that refer to
// groups are unsupported. However, the replacement patterns that refer to the left/right portion
// or all of the input as well as referring to group 0 (i.e. the whole match) are supported.
if ((_options & RegexOptions.NonBacktracking) != 0 &&
node.Kind == RegexNodeKind.Backreference &&
node.M is not (0 or RegexReplacement.LeftPortion or RegexReplacement.RightPortion or RegexReplacement.WholeString))
{
throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
}

AddUnitNode(node);
}

Expand Down Expand Up @@ -1409,7 +1398,6 @@ private RegexNode ScanDollar()
int capnum = -1;
int newcapnum = ch - '0';
MoveRight();
CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
if (IsCaptureSlot(newcapnum))
{
capnum = newcapnum;
Expand All @@ -1427,7 +1415,6 @@ private RegexNode ScanDollar()
newcapnum = newcapnum * 10 + digit;

MoveRight();
CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
if (IsCaptureSlot(newcapnum))
{
capnum = newcapnum;
Expand All @@ -1445,7 +1432,6 @@ private RegexNode ScanDollar()
int capnum = ScanDecimal();
if (!angled || CharsRight() > 0 && RightCharMoveRight() == '}')
{
CheckUnsupportedNonBacktrackingNumericRef(capnum);
if (IsCaptureSlot(capnum))
{
return new RegexNode(RegexNodeKind.Backreference, _options, capnum);
Expand All @@ -1458,13 +1444,6 @@ private RegexNode ScanDollar()
string capname = ScanCapname();
if (CharsRight() > 0 && RightCharMoveRight() == '}')
{
// Throw unconditionally for non-backtracking, even if not a valid capture name,
// as information to determine whether a name is valid or not isn't tracked.
if ((_options & RegexOptions.NonBacktracking) != 0)
{
throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
}

if (IsCaptureName(capname))
{
return new RegexNode(RegexNodeKind.Backreference, _options, CaptureSlotFromName(capname));
Expand Down Expand Up @@ -1515,16 +1494,6 @@ private RegexNode ScanDollar()
return RegexNode.CreateOneWithCaseConversion('$', _options, _culture);
}

/// <summary>Throws on unsupported capture references for NonBacktracking in replacement patterns.</summary>
private void CheckUnsupportedNonBacktrackingNumericRef(int capnum)
{
    // Nothing to reject when the reference is to group 0, or when NonBacktracking is not requested.
    if (capnum == 0 || (_options & RegexOptions.NonBacktracking) == 0)
    {
        return;
    }

    // Any non-zero group number is rejected for NonBacktracking, whether or not it names a real capture,
    // because the information needed to decide validity isn't tracked.
    throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
}

/*
* Scans a capture name: consumes word chars
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Text.RegularExpressions.Symbolic
{
    /// <summary>
    /// A single capture-tracking action: record the current input position as the start or the end of a capture.
    /// </summary>
    /// <remarks>
    /// Effects are applied to registers (arrays holding the positions of all capture starts and ends) and amount
    /// to assigning the current input position. They are generated by, and associated with transitions in, the
    /// effect-aware versions of MkDerivative in SymbolicRegexNode.
    /// </remarks>
    internal struct DerivativeEffect
    {
        /// <summary>The kinds of position assignment an effect can describe.</summary>
        public enum EffectKind
        {
            /// <summary>Assign the current input position to an index in the capture starts array.</summary>
            CaptureStart,

            /// <summary>Assign the current input position to an index in the capture ends array.</summary>
            CaptureEnd,
        }

        /// <summary>Whether this effect records a capture start or a capture end.</summary>
        public EffectKind Kind;

        /// <summary>The capture number this effect refers to.</summary>
        public int CaptureNumber;

        /// <summary>Creates an effect of the given kind for the given capture number.</summary>
        /// <param name="kind">whether a capture start or a capture end is recorded</param>
        /// <param name="captureNumber">the capture number the effect refers to</param>
        public DerivativeEffect(EffectKind kind, int captureNumber) =>
            (Kind, CaptureNumber) = (kind, captureNumber);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Net;
using System.Collections.Generic;

namespace System.Text.RegularExpressions.Symbolic
{
Expand Down Expand Up @@ -55,18 +56,22 @@ internal int WatchDog
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;

/// <summary>
/// Compute the target state for the given input minterm.
/// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
/// Translates a minterm predicate to a character kind, which is a general categorization of characters used
/// for cheaply deciding the nullability of anchors.
/// </summary>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
internal DfaMatchingState<T> Next(T minterm)
/// <remarks>
/// A False predicate is handled as a special case to indicate the very last \n.
/// </remarks>
/// <param name="minterm">the minterm to translate</param>
/// <returns>the character kind of the minterm</returns>
private uint GetNextCharKind(ref T minterm)
{
ICharAlgebra<T> alg = Node._builder._solver;
T wordLetterPredicate = Node._builder._wordLetterPredicateForAnchors;
T newLinePredicate = Node._builder._newLinePredicate;

// minterm == solver.False is used to represent the very last \n
uint nextCharKind = 0;
uint nextCharKind = CharKind.General;
if (alg.False.Equals(minterm))
{
nextCharKind = CharKind.NewLineS;
Expand All @@ -85,19 +90,55 @@ internal DfaMatchingState<T> Next(T minterm)
{
nextCharKind = CharKind.WordLetter;
}
return nextCharKind;
}

/// <summary>
/// Compute the target state for the given input minterm.
/// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
/// </summary>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
internal DfaMatchingState<T> Next(T minterm)
{
// Translate the minterm into a character kind (minterm is passed by ref to the helper)
uint nextCharKind = GetNextCharKind(ref minterm);

// Combined character context
uint context = CharKind.Context(PrevCharKind, nextCharKind);

// Compute the derivative of the node for the given context
// NOTE(review): two declarations of `derivative` follow; this looks like the removed and the added line of a
// diff shown without +/- markers -- confirm which one applies before relying on this block.
SymbolicRegexNode<T> derivative = Node.MkDerivative(minterm, context);
SymbolicRegexNode<T> derivative = Node.MkDerivativeWithEffects(eager: true).TransitionOrdered(minterm, context);

// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new id for it
return Node._builder.MkState(derivative, nextCharKind);
}

/// <summary>
/// Compute a set of transitions for the given minterm.
/// </summary>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
/// <returns>an enumeration of the transitions as pairs of the target state and a list of effects to be applied</returns>
internal IEnumerable<(DfaMatchingState<T>, List<DerivativeEffect>)> AntimirovEagerNextWithEffects(T minterm)
{
    // Character kind of the incoming minterm, combined with this state's previous kind into one context.
    uint kindOfNext = GetNextCharKind(ref minterm);
    uint charContext = CharKind.Context(PrevCharKind, kindOfNext);

    // Enumerate the transitions for that context and pair each target state with its effects.
    foreach (var (targetNode, effectList) in
        Node.MkDerivativeWithEffects(eager: true).TransitionsWithEffects(minterm, charContext))
    {
        // kindOfNext becomes the PrevCharKind of the target state; per the builder's contract an existing
        // state is reused if there is one, otherwise a new id is created for it.
        DfaMatchingState<T> targetState = Node._builder.MkState(targetNode, kindOfNext);
        yield return (targetState, effectList);
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullable(uint nextCharKind)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
Expand All @@ -15,14 +16,16 @@ internal sealed class RegexNodeToSymbolicConverter
internal readonly SymbolicRegexBuilder<BDD> _builder;
private readonly CultureInfo _culture;
private readonly Dictionary<(bool, string), BDD> _createConditionFromSet_Cache = new();
private readonly Hashtable? _caps;

/// <summary>Constructs a regex to symbolic finite automata converter</summary>
public RegexNodeToSymbolicConverter(Unicode.UnicodeCategoryTheory<BDD> categorizer, CultureInfo culture)
public RegexNodeToSymbolicConverter(Unicode.UnicodeCategoryTheory<BDD> categorizer, CultureInfo culture, Hashtable? caps)
{
    // Plain captures of the constructor arguments.
    (_categorizer, _culture, _caps) = (categorizer, culture, caps);

    // The solver comes from the categorizer and must be set before the builder that is constructed over it.
    Solver = categorizer._solver;
    _builder = new SymbolicRegexBuilder<BDD>(Solver);
}

/// <summary>The character solver associated with the regex converter</summary>
Expand Down Expand Up @@ -220,7 +223,7 @@ public SymbolicRegexNode<BDD> Convert(RegexNode node, bool topLevel)
{
nested[i] = Convert(node.Child(i), topLevel);
}
return _builder.MkOr(nested);
return _builder.MkOrderedOr(nested);
}

case RegexNodeKind.Beginning:
Expand All @@ -231,15 +234,17 @@ public SymbolicRegexNode<BDD> Convert(RegexNode node, bool topLevel)
return _builder._bolAnchor;

case RegexNodeKind.Capture when node.N == -1:
return Convert(node.Child(0), topLevel); // treat as non-capturing group (...)
int captureNum;
if (_caps == null || !_caps.TryGetValue(node.M, out captureNum))
captureNum = node.M;
return _builder.MkCapture(Convert(node.Child(0), topLevel: false), captureNum);

case RegexNodeKind.Concatenate:
{
List<RegexNode> nested = FlattenNestedConcatenations(node);
var converted = new SymbolicRegexNode<BDD>[nested.Count];
for (int i = 0; i < converted.Length; i++)
var converted = new SymbolicRegexNode<BDD>[node.ChildCount()];
for (int i = 0; i < node.ChildCount(); ++i)
{
converted[i] = Convert(nested[i], topLevel: false);
converted[i] = Convert(node.Child(i), topLevel: false);
}
return _builder.MkConcat(converted, topLevel);
}
Expand Down Expand Up @@ -371,45 +376,6 @@ void EnsureWordLetterPredicateInitialized()
}
}

List<RegexNode> FlattenNestedConcatenations(RegexNode concat)
{
    // Leaves collected in left-to-right order.
    var flattened = new List<RegexNode>();

    // Explicit work stack instead of recursion.
    var pending = new Stack<RegexNode>();
    pending.Push(concat);

    while (pending.Count > 0)
    {
        RegexNode current = pending.Pop();
        switch (current.Kind)
        {
            case RegexNodeKind.Concatenate:
                // Push children last-to-first so that they are popped first-to-last.
                for (int index = current.ChildCount(); index > 0; index--)
                {
                    pending.Push(current.Child(index - 1));
                }
                break;

            case RegexNodeKind.Capture when current.N != -1:
                // Balancing groups are not supported
                throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, SR.ExpressionDescription_BalancingGroup));

            case RegexNodeKind.Capture:
                // A nonbalancing capture group is unwrapped: only its child is processed.
                pending.Push(current.Child(0));
                break;

            default:
                flattened.Add(current);
                break;
        }
    }

    return flattened;
}

SymbolicRegexNode<BDD> ConvertMulti(RegexNode node, bool topLevel)
{
Debug.Assert(node.Kind == RegexNodeKind.Multi);
Expand Down
Loading

0 comments on commit d78094e

Please sign in to comment.