Captures support for NonBacktracking (#65129)

* Initial version of captures in NonBacktracking * Working version of captures in NonBacktracking * State/transition priorities for NonBacktracking * Various fixes for capturing in NonBacktracking Re-enable replacement patterns. Fix eager derivative. Fix capture numbering to work with sparsely numbered groups. * Enable many subcapture tests for NonBacktracking Also take any subcaptures out of RegexExperiment intersection and negation tests, as capture semantics with these are not correct yet. * Use new eager derivative even without subcaptures This ensures the correct length matches always. * Enable more subcapture tests for NonBacktracking * Performance work for NonBacktracking captures * Bug fixes and comments * Fix for changes removing exclusive_end * Rename back to _lower * Fix beginning handling for captures * Resurrect deleted test * Remove debugging test * Provide effects semantics for extended combinators Also the unordered Or, which gets the semantics that all alternatives are visited. * Apply suggestions from code review to SparseIntMap Co-authored-by: Stephen Toub <stoub@microsoft.com> * Apply suggestions from code review Cleanup and volatile write Co-authored-by: Stephen Toub <stoub@microsoft.com> * Comments, fixes and cleanup * Disable tests for extended features Conjunction and complement are broken in the new capturing support. * Apply suggestions from code review Fixes for FindEndPositionCapturing Co-authored-by: Stephen Toub <stoub@microsoft.com> * Switch all phases to use eager derivative This allows avoiding capture tracking in third phase if there are no subcaptures. As a side effect of this change all derivatives produce OrderedOr nodes, which for now effectively disables the subsumption optimization. * Flatten ordered or and add subsumption Previously the loop subsumption optimization only worked in SymbolicRegexSet, but that is getting phased out with the order maintaining derivative. This reimplements a version of that for ordered ors. 
Also do actual canonicalization of ordered ors as we should. * Avoid some copies and overhead in capturing mode * Improved comment and a better assert * Move some per thread state into the runner This avoids some repeated allocations in the capturing mode. * Fix typo in SparseIntMap * Various cleanup * Fix a comment * Avoid copying captures when doing quick match Co-authored-by: Stephen Toub <stoub@microsoft.com>
dotnet · Feb 12, 2022 · d78094e · d78094e
1 parent bda8c91
commit d78094e
Show file tree

Hide file tree

Showing 26 changed files with 1,731 additions and 580 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -54,9 +54,11 @@
     <!-- RegexOptions.NonBacktracking -->
     <Compile Include="System\Text\RegularExpressions\Symbolic\BooleanClassifier.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\CharKind.cs" />
+    <Compile Include="System\Text\RegularExpressions\Symbolic\DerivativeEffect.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\DfaMatchingState.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\RegexNodeToSymbolicConverter.cs" />
+    <Compile Include="System\Text\RegularExpressions\Symbolic\SparseIntMap.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicMatch.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicNFA.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\SymbolicRegexBuilder.cs" />

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -117,21 +117,10 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C
             // and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking.
             _code = RegexWriter.Write(tree, culture);
 
-            if ((options & RegexOptions.NonBacktracking) != 0)
-            {
-                // NonBacktracking doesn't support captures (other than the implicit top-level capture).
-                capnames = null;
-                capslist = null;
-                caps = null;
-                capsize = 1;
-            }
-            else
-            {
-                capnames = tree.CapNames;
-                capslist = tree.CapsList;
-                caps = _code.Caps;
-                capsize = _code.CapSize;
-            }
+            capnames = tree.CapNames;
+            capslist = tree.CapsList;
+            caps = _code.Caps;
+            capsize = _code.CapSize;
         }
 
         internal static void ValidatePattern(string pattern)

diff --git a/...ibraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/...ibraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
@@ -533,17 +533,6 @@ private RegexNode ScanReplacement()
                     if (RightCharMoveRight() == '$')
                     {
                         RegexNode node = ScanDollar();
-
-                        // NonBacktracking does not support capture groups, so any replacement patterns that refer to
-                        // groups are unsupported. However, the replacement patterns that refer to the left/right portion
-                        // or all of the input as well as referring to group 0 (i.e. the whole match) are supported.
-                        if ((_options & RegexOptions.NonBacktracking) != 0 &&
-                            node.Kind == RegexNodeKind.Backreference &&
-                            node.M is not (0 or RegexReplacement.LeftPortion or RegexReplacement.RightPortion or RegexReplacement.WholeString))
-                        {
-                            throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
-                        }
-
                         AddUnitNode(node);
                     }
 
@@ -1409,7 +1398,6 @@ private RegexNode ScanDollar()
                     int capnum = -1;
                     int newcapnum = ch - '0';
                     MoveRight();
-                    CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
                     if (IsCaptureSlot(newcapnum))
                     {
                         capnum = newcapnum;
@@ -1427,7 +1415,6 @@ private RegexNode ScanDollar()
                         newcapnum = newcapnum * 10 + digit;
 
                         MoveRight();
-                        CheckUnsupportedNonBacktrackingNumericRef(newcapnum);
                         if (IsCaptureSlot(newcapnum))
                         {
                             capnum = newcapnum;
@@ -1445,7 +1432,6 @@ private RegexNode ScanDollar()
                     int capnum = ScanDecimal();
                     if (!angled || CharsRight() > 0 && RightCharMoveRight() == '}')
                     {
-                        CheckUnsupportedNonBacktrackingNumericRef(capnum);
                         if (IsCaptureSlot(capnum))
                         {
                             return new RegexNode(RegexNodeKind.Backreference, _options, capnum);
@@ -1458,13 +1444,6 @@ private RegexNode ScanDollar()
                 string capname = ScanCapname();
                 if (CharsRight() > 0 && RightCharMoveRight() == '}')
                 {
-                    // Throw unconditionally for non-backtracking, even if not a valid capture name,
-                    // as information to determine whether a name is valid or not isn't tracked.
-                    if ((_options & RegexOptions.NonBacktracking) != 0)
-                    {
-                        throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
-                    }
-
                     if (IsCaptureName(capname))
                     {
                         return new RegexNode(RegexNodeKind.Backreference, _options, CaptureSlotFromName(capname));
@@ -1515,16 +1494,6 @@ private RegexNode ScanDollar()
             return RegexNode.CreateOneWithCaseConversion('$', _options, _culture);
         }
 
-        /// <summary>Throws on unsupported capture references for NonBacktracking in replacement patterns.</summary>
-        private void CheckUnsupportedNonBacktrackingNumericRef(int capnum)
-        {
-            // Throw for non-backtracking on non-zero group, even if not a valid capture number, as information to determine whether a name is valid or not isn't tracked
-            if ((_options & RegexOptions.NonBacktracking) != 0 && capnum != 0)
-            {
-                throw new NotSupportedException(SR.NotSupported_NonBacktrackingAndReplacementsWithSubstitutionsOfGroups);
-            }
-        }
-
         /*
          * Scans a capture name: consumes word chars
          */

diff --git a/...m.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DerivativeEffect.cs b/...m.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DerivativeEffect.cs
@@ -0,0 +1,33 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+    /// <summary>
+    /// Describes effects to record capture start and end points.
+    /// </summary>
+    /// <remarks>
+    /// These are applied into registers (arrays of positions for all capture starts and ends) and amount to assignments
+    /// of the current input position. Effects are generated and associated with transitions in effect-aware versions
+    /// of MkDerivative in SymbolicRegexNode.
+    /// </remarks>
+    internal struct DerivativeEffect
+    {
+        public enum EffectKind // Selects which of the two register arrays (capture starts or capture ends) an effect writes to.
+        {
+            /// <summary>Effect to assign the current input position to an index in the capture starts array.</summary>
+            CaptureStart,
+            /// <summary>Effect to assign the current input position to an index in the capture ends array.</summary>
+            CaptureEnd,
+        };
+
+        public EffectKind Kind; // Whether this effect records a capture start or a capture end.
+        public int CaptureNumber; // Capture the effect applies to; presumably the index into the starts/ends array (confirm against the code applying effects).
+
+        public DerivativeEffect(EffectKind kind, int captureNumber) // Stores both arguments as given; no validation is performed here.
+        {
+            Kind = kind;
+            CaptureNumber = captureNumber;
+        }
+    }
+}
diff --git a/...m.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/...m.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
@@ -4,6 +4,7 @@
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Net;
+using System.Collections.Generic;
 
 namespace System.Text.RegularExpressions.Symbolic
 {
@@ -55,18 +56,22 @@ internal int WatchDog
         internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
 
         /// <summary>
-        /// Compute the target state for the given input minterm.
-        /// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
+        /// Translates a minterm predicate to a character kind, which is a general categorization of characters used
+        /// for cheaply deciding the nullability of anchors.
         /// </summary>
-        /// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
-        internal DfaMatchingState<T> Next(T minterm)
+        /// <remarks>
+        /// A False predicate is handled as a special case to indicate the very last \n.
+        /// </remarks>
+        /// <param name="minterm">the minterm to translate</param>
+        /// <returns>the character kind of the minterm</returns>
+        private uint GetNextCharKind(ref T minterm)
         {
             ICharAlgebra<T> alg = Node._builder._solver;
             T wordLetterPredicate = Node._builder._wordLetterPredicateForAnchors;
             T newLinePredicate = Node._builder._newLinePredicate;
 
             // minterm == solver.False is used to represent the very last \n
-            uint nextCharKind = 0;
+            uint nextCharKind = CharKind.General;
             if (alg.False.Equals(minterm))
             {
                 nextCharKind = CharKind.NewLineS;
@@ -85,19 +90,55 @@ internal DfaMatchingState<T> Next(T minterm)
             {
                 nextCharKind = CharKind.WordLetter;
             }
+            return nextCharKind;
+        }
+
+        /// <summary>
+        /// Compute the single target state for the given input minterm; any capture effects are not returned by this method.
+        /// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
+        /// </summary>
+        /// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
+        internal DfaMatchingState<T> Next(T minterm)
+        {
+            uint nextCharKind = GetNextCharKind(ref minterm);
 
             // Combined character context (kind of the previous character together with the kind of the next one)
             uint context = CharKind.Context(PrevCharKind, nextCharKind);
 
             // Compute the derivative of the node for the given minterm and context
-            SymbolicRegexNode<T> derivative = Node.MkDerivative(minterm, context);
+            SymbolicRegexNode<T> derivative = Node.MkDerivativeWithEffects(eager: true).TransitionOrdered(minterm, context);
 
             // nextCharKind will be the PrevCharKind of the target state;
             // use an existing state instead if one exists already,
             // otherwise create a new id for it
             return Node._builder.MkState(derivative, nextCharKind);
         }
 
+        /// <summary>
+        /// Compute a set of transitions for the given minterm. This is an iterator method, so no work is done until the result is enumerated.
+        /// </summary>
+        /// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
+        /// <returns>an enumeration of the transitions as pairs of the target state and a list of effects to be applied</returns>
+        internal IEnumerable<(DfaMatchingState<T>, List<DerivativeEffect>)> AntimirovEagerNextWithEffects(T minterm)
+        {
+            uint nextCharKind = GetNextCharKind(ref minterm);
+
+            // Combined character context (kind of the previous character together with the kind of the next one)
+            uint context = CharKind.Context(PrevCharKind, nextCharKind);
+
+            // Compute the transitions (derivative node plus its effects) for the given minterm and context
+            IEnumerable<(SymbolicRegexNode<T>, List<DerivativeEffect>)> derivativesAndEffects =
+                Node.MkDerivativeWithEffects(eager: true).TransitionsWithEffects(minterm, context);
+
+            foreach (var (derivative, effects) in derivativesAndEffects)
+            {
+                // nextCharKind will be the PrevCharKind of the target state;
+                // use an existing state instead if one exists already,
+                // otherwise create a new id for it
+                yield return (Node._builder.MkState(derivative, nextCharKind), effects);
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullable(uint nextCharKind)
         {

diff --git a/...arExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/...arExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Collections;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
@@ -15,14 +16,16 @@ internal sealed class RegexNodeToSymbolicConverter
         internal readonly SymbolicRegexBuilder<BDD> _builder;
         private readonly CultureInfo _culture;
         private readonly Dictionary<(bool, string), BDD> _createConditionFromSet_Cache = new();
+        private readonly Hashtable? _caps;
 
         /// <summary>Constructs a regex to symbolic finite automata converter</summary>
-        public RegexNodeToSymbolicConverter(Unicode.UnicodeCategoryTheory<BDD> categorizer, CultureInfo culture)
+        public RegexNodeToSymbolicConverter(Unicode.UnicodeCategoryTheory<BDD> categorizer, CultureInfo culture, Hashtable? caps)
         {
             _categorizer = categorizer;
             _culture = culture;
             Solver = categorizer._solver;
             _builder = new SymbolicRegexBuilder<BDD>(Solver);
+            _caps = caps; // May be null; Convert then uses a capture node's own number (node.M) without remapping.
         }
 
         /// <summary>The character solver associated with the regex converter</summary>
@@ -220,7 +223,7 @@ public SymbolicRegexNode<BDD> Convert(RegexNode node, bool topLevel)
                         {
                             nested[i] = Convert(node.Child(i), topLevel);
                         }
-                        return _builder.MkOr(nested);
+                        return _builder.MkOrderedOr(nested);
                     }
 
                 case RegexNodeKind.Beginning:
@@ -231,15 +234,17 @@ public SymbolicRegexNode<BDD> Convert(RegexNode node, bool topLevel)
                     return _builder._bolAnchor;
 
                 case RegexNodeKind.Capture when node.N == -1:
-                    return Convert(node.Child(0), topLevel); // treat as non-capturing group (...)
+                    int captureNum;
+                    if (_caps == null || !_caps.TryGetValue(node.M, out captureNum))
+                        captureNum = node.M;
+                    return _builder.MkCapture(Convert(node.Child(0), topLevel: false), captureNum);
 
                 case RegexNodeKind.Concatenate:
                     {
-                        List<RegexNode> nested = FlattenNestedConcatenations(node);
-                        var converted = new SymbolicRegexNode<BDD>[nested.Count];
-                        for (int i = 0; i < converted.Length; i++)
+                        var converted = new SymbolicRegexNode<BDD>[node.ChildCount()];
+                        for (int i = 0; i < node.ChildCount(); ++i)
                         {
-                            converted[i] = Convert(nested[i], topLevel: false);
+                            converted[i] = Convert(node.Child(i), topLevel: false);
                         }
                         return _builder.MkConcat(converted, topLevel);
                     }
@@ -371,45 +376,6 @@ void EnsureWordLetterPredicateInitialized()
                 }
             }
 
-            List<RegexNode> FlattenNestedConcatenations(RegexNode concat)
-            {
-                var results = new List<RegexNode>();
-
-                var todo = new Stack<RegexNode>();
-                todo.Push(concat);
-
-                while (todo.TryPop(out RegexNode? node))
-                {
-                    if (node.Kind == RegexNodeKind.Concatenate)
-                    {
-                        // Flatten nested concatenations
-                        for (int i = node.ChildCount() - 1; i >= 0; i--)
-                        {
-                            todo.Push(node.Child(i));
-                        }
-                    }
-                    else if (node.Kind == RegexNodeKind.Capture)
-                    {
-                        if (node.N == -1)
-                        {
-                            // Unwrap nonbalancing capture groups
-                            todo.Push(node.Child(0));
-                        }
-                        else
-                        {
-                            // Balancing groups are not supported
-                            throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, SR.ExpressionDescription_BalancingGroup));
-                        }
-                    }
-                    else
-                    {
-                        results.Add(node);
-                    }
-                }
-
-                return results;
-            }
-
             SymbolicRegexNode<BDD> ConvertMulti(RegexNode node, bool topLevel)
             {
                 Debug.Assert(node.Kind == RegexNodeKind.Multi);