From 34eba54a2f3480d2cd70f0f59d04690bb5f68e8a Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Fri, 24 May 2024 12:37:43 +0300
Subject: [PATCH 01/63] Regex automata optimizations

---
 .../Symbolic/MatchingState.cs                 |   8 +-
 .../Symbolic/MintermClassifier.cs             |  51 ++--
 .../RegularExpressions/Symbolic/StateFlags.cs |   6 +-
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  16 +-
 .../Symbolic/SymbolicRegexMatcher.cs          | 261 ++++++++++++++++--
 .../Symbolic/SymbolicRegexThresholds.cs       |   7 +-
 6 files changed, 282 insertions(+), 67 deletions(-)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index dce65a9996330..3c3029fb5a451 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -106,10 +106,9 @@ internal bool IsNullableFor(uint nextCharKind)
         /// <summary>
         /// Builds a <see cref="StateFlags"/> with the relevant flags set.
         /// </summary>
-        /// <param name="solver">a solver for <typeparamref name="TSet"/></param>
         /// <param name="isInitial">whether this state is an initial state</param>
         /// <returns>the flags for this matching state</returns>
-        internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
+        internal StateFlags BuildStateFlags(bool isInitial)
         {
             StateFlags info = 0;
 
@@ -118,11 +117,6 @@ internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
                 info |= StateFlags.IsInitialFlag;
             }
 
-            if (IsDeadend(solver))
-            {
-                info |= StateFlags.IsDeadendFlag;
-            }
-
             if (Node.CanBeNullable)
             {
                 info |= StateFlags.CanBeNullableFlag;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index d00fcc0d62ff4..9fd9f85ac4f96 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -21,16 +21,15 @@ namespace System.Text.RegularExpressions.Symbolic
     internal sealed class MintermClassifier
     {
         /// <summary>An array used when there's a single minterm, in order to map every ASCII character to it trivially.</summary>
-        private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
+        // private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
+        private readonly int[] _lookup;
 
-        /// <summary>Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID.</summary>
-        private readonly int[] _ascii;
-        /// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
-        /// <remarks>
-        /// The use of a multi-terminal BDD here is an implementation detail.  Should we decide its important to optimize non-ASCII inputs further,
-        /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed.
-        /// </remarks>
-        private readonly BDD _nonAscii;
+        // /// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
+        // /// <remarks>
+        // /// The use of a multi-terminal BDD here is an implementation detail.  Should we decide its important to optimize non-ASCII inputs further,
+        // /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed.
+        // /// </remarks>
+        // private readonly BDD _nonAscii;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
@@ -39,12 +38,13 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver)
         {
             Debug.Assert(minterms.Length > 0, "Requires at least");
 
+            var lookup = new int[ushort.MaxValue];
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
                 // For ASCII, use an array containing all zeros.  For non-ASCII, use a BDD that maps everything to 0.
-                _ascii = AllAsciiIsZeroMintermArray;
-                _nonAscii = solver.ReplaceTrue(BDD.True, 0);
+                _lookup = lookup;
+                // _nonAscii = solver.ReplaceTrue(BDD.True, 0);
                 return;
             }
 
@@ -65,36 +65,21 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver)
                 anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
             }
 
-            // Now that we have our mapping that supports any input character, we want to optimize for
-            // ASCII inputs.  Rather than forcing every input ASCII character to consult the BDD at match
-            // time, we precompute a lookup table, where each ASCII character can be used to index into the
-            // array to determine the ID for its corresponding minterm.
-            var ascii = new int[128];
-            for (int i = 0; i < ascii.Length; i++)
+            // TODO: this could be initialized more efficiently but it's
+            // a fundamentally different design choice that preallocates more memory.
+            // the minterm slice [1..] contains the ranges that should be really initialized
+            for (int i = 0; i < ushort.MaxValue; i++)
             {
-                ascii[i] = anyCharacterToMintermId.Find(i);
+                lookup[i] = anyCharacterToMintermId.Find(i);
             }
-            _ascii = ascii;
-
-            // We can also further optimize the BDD in two ways:
-            // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
-            //    for ASCII inputs and thus will never use the BDD for them.  While optional (skipping this step will not
-            //    affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
-            // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
-            //    multi-terminal BDD).  This can be relatively common after (1) above is applied, as many
-            //    patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*").  If every character
-            //    in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
-            BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii);
-            nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
-            _nonAscii = nonAsciiBDD;
+            _lookup = lookup;
         }
 
         /// <summary>Gets the ID of the minterm associated with the specified character.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
-            int[] ascii = _ascii;
-            return (uint)c < (uint)ascii.Length ? ascii[c] : _nonAscii.Find(c);
+            return _lookup[c];
         }
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index 5a620f3771be6..cef4fdfc1ed13 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -6,7 +6,7 @@ namespace System.Text.RegularExpressions.Symbolic
     /// <summary>
     /// These flags provide context-independent information available for every state. They provide a fast way to evaluate
     /// conditions in the inner matching loops of <see cref="SymbolicRegexMatcher{TSet}"/>. The matcher caches one of these
-    /// for every state, for which they are created by <see cref="MatchingState{TSet}.BuildStateFlags(ISolver{TSet}, bool)"/>.
+    /// for every state, for which they are created by <see cref="MatchingState{TSet}.BuildStateFlags(bool)"/>.
     /// In DFA mode the cached flags are used directly, while in NFA mode the <see cref="SymbolicRegexMatcher{TSet}.NfaStateHandler"/>
     /// handles aggregating the flags in the state set.
     /// </summary>
@@ -14,10 +14,10 @@ namespace System.Text.RegularExpressions.Symbolic
     internal enum StateFlags : byte
     {
         IsInitialFlag = 1,
-        IsDeadendFlag = 2,
         IsNullableFlag = 4,
         CanBeNullableFlag = 8,
         SimulatesBacktrackingFlag = 16,
+        IsAcceleratedFlag = 32,
     }
 
     /// <summary>
@@ -26,9 +26,9 @@ internal enum StateFlags : byte
     internal static class StateFlagsExtensions
     {
         internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0;
-        internal static bool IsDeadend(this StateFlags info) => (info & StateFlags.IsDeadendFlag) != 0;
         internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0;
         internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0;
         internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0;
+        internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != 0;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 278b69fe391fe..d20fe6ef13bca 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -40,6 +40,16 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// </summary>
         private StateFlags[] _stateFlagsArray;
 
+        /// <summary>
+        /// Used to short-circuit nullability in the hot loop
+        /// </summary>
+        private bool[] _canBeNullableArray;
+
+        /// <summary>
+        /// Used to short-circuit accelerated states in the hot loop
+        /// </summary>
+        private bool[] _canBeAcceleratedArray;
+
         /// <summary>
         /// The transition function for DFA mode.
         /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
@@ -178,9 +188,13 @@ private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node
                     ArrayResizeAndVolatilePublish(ref _stateArray, newsize);
                     ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog);
                     ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize);
+                    ArrayResizeAndVolatilePublish(ref _canBeNullableArray, newsize);
+                    ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize);
                 }
                 _stateArray[state.Id] = state;
-                _stateFlagsArray[state.Id] = state.BuildStateFlags(Solver, isInitialState);
+                _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
+                _canBeNullableArray[state.Id] = _stateFlagsArray[state.Id].CanBeNullable();
+                _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated();
             }
 
             return state;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 4394329f8eae2..a27bcf26cbd53 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -81,6 +81,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
         private readonly RegexFindOptimizations? _findOpts;
 
+        /// <summary>TODO: summarize</summary>
+        private readonly int _deadStateId;
+
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _initialStates;
@@ -172,6 +175,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             // Initialization for fields in SymbolicRegexMatcher.Automata.cs
             _stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
             _stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
+            _canBeNullableArray = new bool[InitialDfaStateCapacity];
+            _canBeAcceleratedArray = new bool[InitialDfaStateCapacity];
             _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
 
             // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm
@@ -189,7 +194,28 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             if (findOptimizations.IsUseful &&
                 findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
             {
-                _findOpts = findOptimizations;
+                var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
+                {
+                    return fds switch
+                    {
+                        // anything above 4 uint16 chars is generally slower than DFA
+                        { Chars: not null } => fds.Chars.Length > 4,
+                        { Range: not null } => false,
+                        { Set: not null } => true,
+                        _ => false
+                    };
+                });
+                // a DFA is sometimes 10x-100x faster than the optimizations
+                // the "IsUseful" is harming the engine here
+                _findOpts = findOptimizations switch
+                {
+                    { FindMode: FindNextStartingPositionMode.FixedDistanceString_LeftToRight } => findOptimizations,
+                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } =>
+                        findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke)? null : findOptimizations,
+                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } => setIsTooCommon(
+                        findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations,
+                    _ => findOptimizations // TODO: unsure which options are left here
+                };
             }
 
             // Determine the number of initial states. If there's no anchor, only the default previous
@@ -199,6 +225,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             // The loops below and how character kinds are calculated assume that the "general" character kind is zero
             Debug.Assert(CharKind.General == 0);
 
+            // Assign dead state id
+            _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
+
             // Create the initial states for the original pattern.
             var initialStates = new MatchingState<TSet>[statesCount];
             for (uint charKind = 0; charKind < initialStates.Length; charKind++)
@@ -448,8 +477,9 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                     input.Length;
 
                 bool done = currentState.NfaState is not null ?
-                    FindEndPositionDeltas<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    FindEndPositionDeltas<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+                    FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    _findOpts is null ? FindEndPositionDeltasDFANoSkip<DfaStateHandler, TInputReader, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
@@ -483,7 +513,75 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
             return endPos;
         }
 
+
+        /// <summary>
+        /// TODO: this is essentially a stripped down version when there's no good prefix optimizations
+        /// i don't trust the compiler to optimize this and it makes a
+        /// ~50% difference in performance with removing unnecessary checks alone
+        /// </summary>
+        private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader,  TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+            where TStateHandler : struct, IStateHandler
+            where TInputReader : struct, IInputReader
+            where TNullabilityHandler : struct, INullabilityHandler
+        {
+            // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
+            int pos = posRef;
+            int endPos = endPosRef;
+            int endStateId = endStateIdRef;
+            int initialStatePos = initialStatePosRef;
+            int initialStatePosCandidate = initialStatePosCandidateRef;
+            try
+            {
+                // Loop through each character in the input, transitioning from state to state for each.
+                while (true)
+                {
+                    if (state.DfaStateId == _deadStateId)
+                    {
+                        return true;
+                    }
+
+                    int positionId = TInputReader.GetPositionId(this, input, pos);
+
+                    // If the state is nullable for the next character, meaning it accepts the empty string,
+                    // we found a potential end state.
+                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    {
+                        endPos = pos;
+                        endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
+                        initialStatePos = initialStatePosCandidate;
+
+                        // A match is known to exist.  If that's all we need to know, we're done.
+                        if (mode == RegexRunnerMode.ExistenceRequired)
+                        {
+                            return true;
+                        }
+                    }
+
+                    // If there is more input available try to transition with the next character.
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    {
+                        return false;
+                    }
+
+                    // We successfully transitioned, so update our current input index to match.
+                    pos++;
+                }
+            }
+            finally
+            {
+                // Write back the local copies of the ref values.
+                posRef = pos;
+                endPosRef = endPos;
+                endStateIdRef = endStateId;
+                initialStatePosRef = initialStatePos;
+                initialStatePosCandidateRef = initialStatePosCandidate;
+            }
+        }
+
+
         /// <summary>
+        /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups
         /// Workhorse inner loop for <see cref="FindEndPosition"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
@@ -500,7 +598,7 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
         /// 0 if iteration completed because we reached an initial state.
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
-        private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+        private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
                 ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
@@ -518,22 +616,101 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                 // Loop through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    StateFlags flags = TStateHandler.GetStateFlags(this, in state);
+                    if (state.DfaStateId == _deadStateId)
+                    {
+                        return true;
+                    }
 
                     // Check if currentState represents an initial state. If it does, call into any possible find optimizations
                     // to hopefully more quickly find the next possible starting location.
-                    if (flags.IsInitial())
+                    // if (flags.IsAccelerated())
+                    if (_canBeAcceleratedArray[state.DfaStateId])
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
                         {
                             return true;
                         }
-
                         initialStatePosCandidate = pos;
                     }
 
-                    // If the state is a dead end, such that we can't transition anywhere else, end the search.
-                    if (flags.IsDeadend())
+                    int positionId = TInputReader.GetPositionId(this, input, pos);
+
+                    // If the state is nullable for the next character, meaning it accepts the empty string,
+                    // we found a potential end state.
+                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    {
+                        endPos = pos;
+                        endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
+                        initialStatePos = initialStatePosCandidate;
+
+                        // A match is known to exist.  If that's all we need to know, we're done.
+                        if (mode == RegexRunnerMode.ExistenceRequired)
+                        {
+                            return true;
+                        }
+                    }
+
+                    // If there is more input available try to transition with the next character.
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    {
+                        return false;
+                    }
+
+                    // We successfully transitioned, so update our current input index to match.
+                    pos++;
+                }
+            }
+            finally
+            {
+                // Write back the local copies of the ref values.
+                posRef = pos;
+                endPosRef = endPos;
+                endStateIdRef = endStateId;
+                initialStatePosRef = initialStatePos;
+                initialStatePosCandidateRef = initialStatePosCandidate;
+            }
+        }
+
+        /// <summary>
+        /// TODO: this is the fallback NFA function
+        /// Workhorse inner loop for <see cref="FindEndPosition"/>.  Consumes the <paramref name="input"/> character by character,
+        /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
+        /// lazily building out the graph as needed.
+        /// </summary>
+        /// <remarks>
+        /// The <typeparamref name="TStateHandler"/> supplies the actual transitioning logic, controlling whether processing is
+        /// performed in DFA mode or in NFA mode.  However, it expects <paramref name="state"/> to be configured to match,
+        /// so for example if <typeparamref name="TStateHandler"/> is a <see cref="DfaStateHandler"/>, it expects the <paramref name="state"/>'s
+        /// <see cref="CurrentState.DfaStateId"/> to be non-negative and its <see cref="CurrentState.NfaState"/> to be null; vice versa for
+        /// <see cref="NfaStateHandler"/>.
+        /// </remarks>
+        /// <returns>
+        /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
+        /// 0 if iteration completed because we reached an initial state.
+        /// A negative value if iteration completed because we ran out of input or we failed to transition.
+        /// </returns>
+        private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+            where TStateHandler : struct, IStateHandler
+            where TInputReader : struct, IInputReader
+            where TFindOptimizationsHandler : struct, IInitialStateHandler
+            where TNullabilityHandler : struct, INullabilityHandler
+        {
+            // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
+            int pos = posRef;
+            int endPos = endPosRef;
+            int endStateId = endStateIdRef;
+            int initialStatePos = initialStatePosRef;
+            int initialStatePosCandidate = initialStatePosCandidateRef;
+            try
+            {
+                // Loop through each character in the input, transitioning from state to state for each.
+                while (true)
+                {
+                    StateFlags flags = TStateHandler.GetStateFlags(this, in state);
+
+                    // Dead end here means the set is empty
+                    if (state.NfaState!.NfaStateSet.Count == 0)
                     {
                         return true;
                     }
@@ -608,8 +785,8 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
             {
                 // Run the DFA or NFA traversal backwards from the current point using the current state.
                 bool done = currentState.NfaState is not null ?
-                    FindStartPositionDeltas<NfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
-                    FindStartPositionDeltas<DfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
+                    FindStartPositionDeltasNFA<NfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
+                    FindStartPositionDeltasDFA<DfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
 
                 // If we found the starting position, we're done.
                 if (done)
@@ -635,7 +812,7 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
         /// starting at <paramref name="i"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
         /// </summary>
-        private bool FindStartPositionDeltas<TStateHandler, TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
+        private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TNullabilityHandler : struct, INullabilityHandler
@@ -647,20 +824,66 @@ private bool FindStartPositionDeltas<TStateHandler, TInputReader, TNullabilityHa
                 // Loop backwards through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    StateFlags flags = TStateHandler.GetStateFlags(this, in state);
+                    int positionId = TInputReader.GetPositionId(this, input, pos - 1);
+                    // If the state accepts the empty string, we found a valid starting position.  Record it and keep going,
+                    // since we're looking for the earliest one to occur within bounds.
+                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
+                            TStateHandler.GetStateFlags(this, in state)))
+                    {
+                        lastStart = pos;
+                    }
+
+                    // If we are past the start threshold or if the state is a dead end, bail; we should have already
+                    // found a valid starting location.
+                    if (pos <= startThreshold || state.DfaStateId == _deadStateId)
+                    {
+                        Debug.Assert(lastStart != -1);
+                        return true;
+                    }
+
+                    // Try to transition with the next character, the one before the current position.
+                    if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    {
+                        // Return false to indicate the search didn't finish.
+                        return false;
+                    }
+
+                    // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
+                    pos--;
+                }
+            }
+            finally
+            {
+                // Write back the local copies of the ref values.
+                i = pos;
+            }
+        }
 
+        private bool FindStartPositionDeltasNFA<TStateHandler, TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
+            where TStateHandler : struct, IStateHandler
+            where TInputReader : struct, IInputReader
+            where TNullabilityHandler : struct, INullabilityHandler
+        {
+            // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
+            int pos = i;
+            try
+            {
+                // Loop backwards through each character in the input, transitioning from state to state for each.
+                while (true)
+                {
                     int positionId = TInputReader.GetPositionId(this, input, pos - 1);
 
                     // If the state accepts the empty string, we found a valid starting position.  Record it and keep going,
                     // since we're looking for the earliest one to occur within bounds.
-                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, flags))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
+                            TStateHandler.GetStateFlags(this, in state)))
                     {
                         lastStart = pos;
                     }
 
                     // If we are past the start threshold or if the state is a dead end, bail; we should have already
                     // found a valid starting location.
-                    if (pos <= startThreshold || flags.IsDeadend())
+                    if (pos <= startThreshold || state.DfaStateId == _deadStateId)
                     {
                         Debug.Assert(lastStart != -1);
                         return true;
@@ -746,7 +969,7 @@ private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i,
 
                             int coreStateId = GetCoreStateId(targetStateId);
                             StateFlags flags = _stateFlagsArray[coreStateId];
-                            Debug.Assert(!flags.IsDeadend());
+                            Debug.Assert(coreStateId != _deadStateId);
 
                             if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind<TInputReader>(input, i + 1))))
                             {
@@ -1150,12 +1373,6 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<
             public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
             {
                 SparseIntMap<int> stateSet = state.NfaState!.NfaStateSet;
-                if (stateSet.Count == 0)
-                {
-                    // In NFA state sets dead ends are never included. Instead an empty set of states represents a dead end.
-                    return StateFlags.IsDeadendFlag;
-                }
-                else
                 {
                     // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
                     // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index 6057827e1d53f..c0118d52553ff 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -22,8 +22,13 @@ internal static class SymbolicRegexThresholds
         /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex.
         /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing
         /// to create a new node and the graph is already or newly beyond this threshold.
+        /// TODO: summarize this
+        /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
+        /// with an entirely different search-time algorithmic complexity
+        /// 100_000 isn't a really a high memory cost either,
+        /// i'd even put 1_000_000 on the table but that might push it for general purpose use
         /// </remarks>
-        internal const int NfaThreshold = 10_000;
+        internal const int NfaThreshold = 100_000;
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST

From 49607f42e3c4f24dfff979d04bb24b0f5ce04fc2 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Fri, 24 May 2024 15:13:52 +0300
Subject: [PATCH 02/63] off by one err

---
 .../Text/RegularExpressions/Symbolic/MintermClassifier.cs     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 9fd9f85ac4f96..3e97273e726f7 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -38,7 +38,7 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver)
         {
             Debug.Assert(minterms.Length > 0, "Requires at least");
 
-            var lookup = new int[ushort.MaxValue];
+            var lookup = new int[ushort.MaxValue + 1];
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
@@ -68,7 +68,7 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver)
             // TODO: this could be initialized more efficiently but it's
             // a fundamentally different design choice that preallocates more memory.
             // the minterm slice [1..] contains the ranges that should be really initialized
-            for (int i = 0; i < ushort.MaxValue; i++)
+            for (int i = 0; i <= ushort.MaxValue; i++)
             {
                 lookup[i] = anyCharacterToMintermId.Find(i);
             }

From 5ac29f36906e6afff5b3b1835d7155ff524e4bf6 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 26 May 2024 21:52:19 +0300
Subject: [PATCH 03/63] wip reversal optimizations

---
 .../src/System.Text.RegularExpressions.csproj |  2 +
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 72 +++++++++++++++++++
 .../Symbolic/SymbolicRegexMatcher.cs          | 30 +++++++-
 .../FunctionalTests/NonBacktrackingTests.cs   | 22 ++++++
 ...ystem.Text.RegularExpressions.Tests.csproj |  1 +
 5 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 2042b930fdd2c..cb9c7e35ff972 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,6 +5,8 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
+<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823</NoWarn>-->
+
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index d20fe6ef13bca..cf2cfa196398f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -50,6 +50,15 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// </summary>
         private bool[] _canBeAcceleratedArray;
 
+#if DEBUG
+        // private readonly Action<string> _wout = st =>
+        // {
+        //     var a_cons = System.Reflection.Assembly.Load("System.Console");
+        //     var t_cons = a_cons.GetType("System.Console")!;
+        //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
+        //     wl!.Invoke(null, [st]);
+        // };
+#endif
         /// <summary>
         /// The transition function for DFA mode.
         /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
@@ -162,6 +171,69 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
             return GetOrCreateState_NoLock(node, prevCharKind);
         }
 
+        /// <summary>
+        /// Optimized reversal state computation which takes skips the fixed length parts
+        /// </summary>
+        /// <param name="node"></param>
+        /// <returns></returns>
+        private (int, MatchingState<TSet>) CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
+        {
+            var pos = 0;
+            var current = node;
+            var canLoop = true;
+            var incrPos = new Func<(int, SymbolicRegexNode<TSet>), (bool, SymbolicRegexNode<TSet>)>(value =>
+            {
+                pos += value.Item1;
+                return (true, value.Item2);
+            });
+            var decrLoop = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(value =>
+            {
+                var concat = value;
+                var loop = concat._left;
+                switch (loop!._left!.Kind)
+                {
+                    case SymbolicRegexNodeKind.Singleton:
+                        if (loop._lower == loop._upper)
+                        {
+                            pos += loop._lower;
+                            return (true, concat._right!);
+                        }
+                        if (loop._lower > 0)
+                        {
+                            var delta = loop._upper - loop._lower;
+                            var newLeft = _builder.CreateLoop(loop._left, loop.IsLazy, 0, delta);
+                            var newNode = _builder.CreateConcat(newLeft, concat._right!);
+                            pos += loop._lower;
+                            return (true, newNode);
+                        }
+                        return (false, concat);
+                    default:
+                        return (false, concat);
+                }
+            });
+            while (canLoop)
+            {
+#if DEBUG
+                // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}");
+#endif
+                (bool loop, SymbolicRegexNode<TSet> next) = current switch
+                {
+                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} =>
+                        (true, current._right!),
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } =>
+                        (true, current._right!),
+                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} =>
+                        incrPos((1, current._right!)),
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
+                        decrLoop(current),
+                    _ => (false, current)
+                };
+                canLoop = loop;
+                current = next;
+            }
+            return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0, false));
+        }
+
         /// <summary>
         /// Create a state with given node and previous character context.
         /// </summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index a27bcf26cbd53..0f7bf2c01cd78 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -96,6 +96,8 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _reverseInitialStates;
 
+        private readonly (int, MatchingState<TSet>) _optimizedReversalState;
+
         /// <summary>Partition of the input space of sets.</summary>
         private readonly TSet[] _minterms;
 
@@ -172,6 +174,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                 ((BitVectorSolver)(object)builder._solver)._classifier;
             _capsize = captureCount;
 
+
             // Initialization for fields in SymbolicRegexMatcher.Automata.cs
             _stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
             _stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
@@ -262,6 +265,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             }
             _reverseInitialStates = reverseInitialStates;
 
+            // Create optimized reversal
+            _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder));
+
             // Maps a minterm ID to a character kind
             uint CalculateMintermIdKind(int mintermId)
             {
@@ -776,9 +782,29 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
 
             // Get the starting state for the reverse pattern. This depends on previous character (which, because we're
             // going backwards, is character number i).
-            var currentState = new CurrentState(_reverseInitialStates[GetCharKind<TInputReader>(input, i)]);
-
+            CurrentState currentState;
             int lastStart = -1; // invalid sentinel value
+            // if possible use optimized reversal instead
+            if (_optimizedReversalState.Item1 > 0)
+            {
+                i -= _optimizedReversalState.Item1;
+                currentState = new CurrentState(_optimizedReversalState.Item2);
+                // anchor variant may need context to be computed if nullable
+                if (_pattern._info.ContainsSomeAnchor && _canBeNullableArray[currentState.DfaStateId])
+                {
+                    int positionId = TInputReader.GetPositionId(this, input, i);
+                    if (TNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
+                            in currentState, positionId,
+                            DfaStateHandler.GetStateFlags(this, in currentState)))
+                    {
+                        lastStart = i;
+                    }
+                }
+            }
+            else
+            {
+                currentState = new CurrentState(_reverseInitialStates[GetCharKind<TInputReader>(input, i)]);
+            }
 
             // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
             while (true)
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs
new file mode 100644
index 0000000000000..501df78391690
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs
@@ -0,0 +1,22 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections;
+using System.Collections.Generic;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+    /// <summary>
+    /// TODO: Create tests here later
+    /// </summary>
+    public static partial class NonBacktrackingTests
+    {
+
+        // [Fact]
+        // public static void Test()
+        // {
+        // }
+
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
index dbab47f63d097..afdd6f1e51f24 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
@@ -19,6 +19,7 @@
     <Compile Include="GroupCollectionTests.cs" />
     <Compile Include="MatchCollectionTests.cs" />
     <Compile Include="MonoRegexTests.cs" />
+    <Compile Include="NonBacktrackingTests.cs" />
     <Compile Include="Regex.CompileToAssembly.Tests.cs" />
     <Compile Include="Regex.Ctor.Tests.cs" />
     <Compile Include="Regex.Cache.Tests.cs" />

From e440dec535fba3c368f2e1e9830d7473ccb141b1 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 27 May 2024 00:28:55 +0300
Subject: [PATCH 04/63] removing unnecessary overhead

---
 .../src/System.Text.RegularExpressions.csproj |   3 +-
 .../Symbolic/MatchingState.cs                 |   3 +-
 .../Symbolic/MintermClassifier.cs             |   1 +
 .../RegularExpressions/Symbolic/StateFlags.cs |  21 +++-
 .../Symbolic/SymbolicRegexInfo.cs             |  21 ++--
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |   1 +
 .../Symbolic/SymbolicRegexMatcher.cs          | 100 +++++++++++++-----
 .../Symbolic/SymbolicRegexNode.cs             |   3 +-
 8 files changed, 112 insertions(+), 41 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index cb9c7e35ff972..a6f7119d2fd2f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,8 +5,7 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823</NoWarn>-->
-
+<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060</NoWarn>-->
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 3c3029fb5a451..da7128b464da5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -99,8 +99,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         internal bool IsNullableFor(uint nextCharKind)
         {
             Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
-            uint context = CharKind.Context(PrevCharKind, nextCharKind);
-            return Node.IsNullableFor(context);
+            return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind));
         }
 
         /// <summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 3e97273e726f7..3810f35f69f84 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -81,5 +81,6 @@ public int GetMintermID(int c)
         {
             return _lookup[c];
         }
+        public int[] Lookup => _lookup;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index cef4fdfc1ed13..990eb4807c7f1 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -1,6 +1,8 @@
 ﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Runtime.CompilerServices;
+
 namespace System.Text.RegularExpressions.Symbolic
 {
     /// <summary>
@@ -13,6 +15,7 @@ namespace System.Text.RegularExpressions.Symbolic
     [Flags]
     internal enum StateFlags : byte
     {
+        None = 0,
         IsInitialFlag = 1,
         IsNullableFlag = 4,
         CanBeNullableFlag = 8,
@@ -25,10 +28,18 @@ internal enum StateFlags : byte
     /// </summary>
     internal static class StateFlagsExtensions
     {
-        internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0;
-        internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0;
-        internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0;
-        internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0;
-        internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != 0;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
index ff95195292bfa..750fbed4774bf 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
@@ -17,6 +17,7 @@ namespace System.Text.RegularExpressions.Symbolic
         private const uint IsHighPriorityNullableMask = 64;
         private const uint ContainsEffectMask = 128;
         private const uint ContainsLineAnchorMask = 256;
+        private const uint ContainsEndZAnchorMask = 512;
 
         private readonly uint _info;
 
@@ -26,7 +27,7 @@ private static SymbolicRegexInfo Create(
             bool isAlwaysNullable = false, bool canBeNullable = false,
             bool startsWithLineAnchor = false, bool containsLineAnchor = false,
             bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
-            bool isHighPriorityNullable = false, bool containsEffect = false)
+            bool isHighPriorityNullable = false, bool containsEffect = false, bool containsEndZAnchor = false)
         {
             // Assert that the expected implications hold. For example, every node that contains a line anchor
             // must also be marked as containing some anchor.
@@ -43,7 +44,8 @@ private static SymbolicRegexInfo Create(
                 (startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) |
                 (containsSomeAnchor ? ContainsSomeAnchorMask : 0) |
                 (isHighPriorityNullable ? IsHighPriorityNullableMask : 0) |
-                (containsEffect ? ContainsEffectMask : 0));
+                (containsEffect ? ContainsEffectMask : 0) |
+                (containsEndZAnchor ? ContainsEndZAnchorMask : 0));
         }
 
         public bool IsNullable => (_info & IsAlwaysNullableMask) != 0;
@@ -53,7 +55,6 @@ private static SymbolicRegexInfo Create(
         public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
 
         public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
-
         public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
 
         public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
@@ -63,6 +64,7 @@ private static SymbolicRegexInfo Create(
         public bool IsHighPriorityNullable => (_info & IsHighPriorityNullableMask) != 0;
 
         public bool ContainsEffect => (_info & ContainsEffectMask) != 0;
+        public bool ContainsEndZAnchor => (_info & ContainsEndZAnchorMask) != 0;
 
         /// <summary>
         /// Used for any node that acts as an epsilon, i.e., something that always matches the empty string.
@@ -77,13 +79,15 @@ public static SymbolicRegexInfo Epsilon() =>
         /// Used for all anchors.
         /// </summary>
         /// <param name="isLineAnchor">whether this anchor is a line anchor</param>
-        public static SymbolicRegexInfo Anchor(bool isLineAnchor) =>
+        /// <param name="isEndZAnchor">whether this anchor is an end Z anchor</param>
+        public static SymbolicRegexInfo Anchor(bool isLineAnchor, bool isEndZAnchor) =>
             Create(
                 canBeNullable: true,
                 startsWithLineAnchor: isLineAnchor,
                 containsLineAnchor: isLineAnchor,
                 startsWithSomeAnchor: true,
-                containsSomeAnchor: true);
+                containsSomeAnchor: true,
+                containsEndZAnchor: isEndZAnchor);
 
         /// <summary>
         /// The alternation remains high priority nullable if the left alternative is so.
@@ -99,7 +103,8 @@ public static SymbolicRegexInfo Alternate(SymbolicRegexInfo left_info, SymbolicR
                 startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor,
                 containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
                 isHighPriorityNullable: left_info.IsHighPriorityNullable,
-                containsEffect: left_info.ContainsEffect || right_info.ContainsEffect);
+                containsEffect: left_info.ContainsEffect || right_info.ContainsEffect,
+                containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor);
 
         /// <summary>
         /// Concatenation remains high priority nullable if both left and right are so.
@@ -115,7 +120,9 @@ public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRege
                 startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
                 containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
                 isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable,
-                containsEffect: left_info.ContainsEffect || right_info.ContainsEffect);
+                containsEffect: left_info.ContainsEffect || right_info.ContainsEffect,
+                containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor
+                );
 
         /// <summary>
         /// Inherits anchor visibility from the loop body.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index cf2cfa196398f..06614dfd34f29 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -4,6 +4,7 @@
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
 using System.Threading;
 
 namespace System.Text.RegularExpressions.Symbolic
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 0f7bf2c01cd78..f55a8fcd752c2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -84,6 +84,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>TODO: summarize</summary>
         private readonly int _deadStateId;
 
+        /// <summary>TODO: summarize</summary>
+        private readonly bool _containsAnyAnchor;
+
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _initialStates;
@@ -230,6 +233,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
 
             // Assign dead state id
             _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
+            _containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
 
             // Create the initial states for the original pattern.
             var initialStates = new MatchingState<TSet>[statesCount];
@@ -378,7 +382,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
             // the c as the low boundary for the starting position.
             int matchStartLowBoundary, matchStartLengthMarker;
-            int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
+            int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
             {
                 (true, true, true) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
                 (true, true, false) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
@@ -418,7 +422,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             {
                 Debug.Assert(matchEnd >= startat - 1);
                 matchStart = matchEnd < startat ?
-                    startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch
+                    startat : (_pattern._info.ContainsEndZAnchor, _pattern._info.ContainsSomeAnchor) switch
                     {
                         (true, true) => FindStartPosition<FullInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
                         (true, false) => FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
@@ -484,7 +488,7 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
 
                 bool done = currentState.NfaState is not null ?
                     FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    _findOpts is null ? FindEndPositionDeltasDFANoSkip<DfaStateHandler, TInputReader, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    _findOpts is null ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
                     FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -525,16 +529,16 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
         /// i don't trust the compiler to optimize this and it makes a
         /// ~50% difference in performance with removing unnecessary checks alone
         /// </summary>
-        private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader,  TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
-            where TStateHandler : struct, IStateHandler
-            where TInputReader : struct, IInputReader
-            where TNullabilityHandler : struct, INullabilityHandler
+        private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+                ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
         {
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
+            int final = length - 1;
+            Span<int> mtlookup = _mintermClassifier.Lookup.AsSpan();
             int endStateId = endStateIdRef;
+            int currStateId = startStateId;
             int initialStatePos = initialStatePosRef;
             int initialStatePosCandidate = initialStatePosCandidateRef;
             try
@@ -542,32 +546,49 @@ private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader,  TNulla
                 // Loop through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    if (state.DfaStateId == _deadStateId)
+                    if (currStateId == _deadStateId)
                     {
                         return true;
                     }
-
-                    int positionId = TInputReader.GetPositionId(this, input, pos);
+                    int positionId = mtlookup[input[pos]];
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (_canBeNullableArray[currStateId])
                     {
-                        endPos = pos;
-                        endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
-                        initialStatePos = initialStatePosCandidate;
-
-                        // A match is known to exist.  If that's all we need to know, we're done.
-                        if (mode == RegexRunnerMode.ExistenceRequired)
+                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
                         {
-                            return true;
+                            endPos = pos;
+                            endStateId = currStateId;
+                            initialStatePos = initialStatePosCandidate;
+
+                            // A match is known to exist.  If that's all we need to know, we're done.
+                            if (mode == RegexRunnerMode.ExistenceRequired)
+                            {
+                                return true;
+                            }
                         }
                     }
 
                     // If there is more input available try to transition with the next character.
-                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    if (pos >= final || !DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId))
                     {
-                        return false;
+                        if (pos < final)
+                        {
+                            return false;
+                        }
+                        // one off check for the final position
+                        // this is just to move it out of the hot loop
+                        if ((!_stateFlagsArray[currStateId].IsNullable() &&
+                             !_stateArray[currStateId]!.IsNullableFor(
+                                 GetPositionKind(positionId))))
+                        {
+                            return false;
+                        }
+                        endPos = pos;
+                        endStateId = currStateId;
+                        initialStatePos = initialStatePosCandidate;
+                        return mode == RegexRunnerMode.ExistenceRequired;
                     }
 
                     // We successfully transitioned, so update our current input index to match.
@@ -576,6 +597,8 @@ private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader,  TNulla
             }
             finally
             {
+                // handle final pos here
+
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
@@ -790,11 +813,10 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
                 i -= _optimizedReversalState.Item1;
                 currentState = new CurrentState(_optimizedReversalState.Item2);
                 // anchor variant may need context to be computed if nullable
-                if (_pattern._info.ContainsSomeAnchor && _canBeNullableArray[currentState.DfaStateId])
+                if (_containsAnyAnchor && _canBeNullableArray[currentState.DfaStateId])
                 {
-                    int positionId = TInputReader.GetPositionId(this, input, i);
                     if (TNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
-                            in currentState, positionId,
+                            in currentState, TInputReader.GetPositionId(this, input, i),
                             DfaStateHandler.GetStateFlags(this, in currentState)))
                     {
                         lastStart = i;
@@ -1239,6 +1261,36 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
                 return false;
             }
 
+            /// <summary>Take the transition to the next DFA state without paying for the NFA structure</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
+                int mintermId)
+            {
+                Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}.");
+                // Use the mintermId for the character being read to look up which state to transition to.
+                // If that state has already been materialized, move to it, and we're done. If that state
+                // hasn't been materialized, try to create it; if we can, move to it, and we're done.
+                int nextStateId = matcher._dfaDelta[matcher.DeltaOffset(state, mintermId)];
+                if (nextStateId > 0)
+                {
+                    // There was an existing DFA transition to some state. Move to it and
+                    // return that we're still operating as a DFA and can keep going.
+                    state = nextStateId;
+                    return true;
+                }
+
+                if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId,
+                        matcher.DeltaOffset(state, mintermId),
+                        checkThreshold: true, out MatchingState<TSet>? nextState))
+                {
+                    // We were able to create a new DFA transition to some state. Move to it and
+                    // return that we're still operating as a DFA and can keep going.
+                    state = nextState.Id;
+                    return true;
+                }
+                return false;
+            }
+
             /// <summary>
             /// Gets context independent state information:
             /// - whether this is an initial state
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
index a138c819be00f..4309054c354e6 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
@@ -396,7 +396,8 @@ SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
                 SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
             return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is
                     SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
-                    SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
+                    SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor,
+                        kind is SymbolicRegexNodeKind.EndAnchorZ));
         }
 
         #endregion

From 627fd9099ac66b10f28cd1fff15ba697827ab829 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 27 May 2024 00:58:10 +0300
Subject: [PATCH 05/63] handle final position correctly

---
 .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index f55a8fcd752c2..84f31bec2028a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -488,7 +488,7 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
 
                 bool done = currentState.NfaState is not null ?
                     FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    _findOpts is null ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    _findOpts is null && pos < input.Length - 1 ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
                     FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -561,7 +561,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                             endPos = pos;
                             endStateId = currStateId;
                             initialStatePos = initialStatePosCandidate;
-
                             // A match is known to exist.  If that's all we need to know, we're done.
                             if (mode == RegexRunnerMode.ExistenceRequired)
                             {
@@ -597,8 +596,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
             }
             finally
             {
-                // handle final pos here
-
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;

From 7ae644012f8d058160a2e062ffbf273d59aa0b27 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 27 May 2024 17:07:02 +0300
Subject: [PATCH 06/63] edge case workarounds, tests should be ok again

---
 .../Symbolic/SymbolicRegexMatcher.cs          | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 84f31bec2028a..99d34fba06ace 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -87,6 +87,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>TODO: summarize</summary>
         private readonly bool _containsAnyAnchor;
 
+        /// <summary>TODO: summarize</summary>
+        private readonly bool _containsEndZAnchor;
+
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _initialStates;
@@ -233,7 +236,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
 
             // Assign dead state id
             _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
+
+            // Assign edge case info for quick lookup
             _containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
+            _containsEndZAnchor = _pattern._info.ContainsEndZAnchor;
 
             // Create the initial states for the original pattern.
             var initialStates = new MatchingState<TSet>[statesCount];
@@ -488,7 +494,8 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
 
                 bool done = currentState.NfaState is not null ?
                     FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    _findOpts is null && pos < input.Length - 1 ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    // If there are no edge cases then use the quicker loop
+                    (_findOpts is null && !_containsEndZAnchor && pos < input.Length - 1) ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
                     FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -570,20 +577,28 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     }
 
                     // If there is more input available try to transition with the next character.
-                    if (pos >= final || !DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId))
+                    // Note: the order here is important so the transition gets taken
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= final)
                     {
+                        // _wout($"end1: {_stateArray[currStateId]}");
                         if (pos < final)
                         {
                             return false;
                         }
+                        pos++;
+                        // _wout($"end: {_stateArray[currStateId]}");
+                        // final transition
+                        // DfaStateHandler.TryTakeDFATransition(this, ref currStateId, -1);
+                        //
                         // one off check for the final position
                         // this is just to move it out of the hot loop
                         if ((!_stateFlagsArray[currStateId].IsNullable() &&
                              !_stateArray[currStateId]!.IsNullableFor(
-                                 GetPositionKind(positionId))))
+                                 GetPositionKind(-1))))
                         {
                             return false;
                         }
+                        // the end position (-1) was nullable
                         endPos = pos;
                         endStateId = currStateId;
                         initialStatePos = initialStatePosCandidate;

From 383f3e5fb4c58bd455d622528c842fa5a1dc3ecd Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 27 May 2024 18:29:38 +0300
Subject: [PATCH 07/63] optimizing lookup initialization

---
 .../Symbolic/MintermClassifier.cs             | 16 +++--
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 65 ++++++++++---------
 .../Symbolic/SymbolicRegexThresholds.cs       |  1 +
 .../tests/UnitTests/SymbolicRegexTests.cs     | 14 ++++
 4 files changed, 62 insertions(+), 34 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 3810f35f69f84..799814ee7b9c2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -65,12 +65,18 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver)
                 anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
             }
 
-            // TODO: this could be initialized more efficiently but it's
-            // a fundamentally different design choice that preallocates more memory.
-            // the minterm slice [1..] contains the ranges that should be really initialized
-            for (int i = 0; i <= ushort.MaxValue; i++)
+            // assign minterm category for every char
+            // unused characters in minterm 0 get mapped to zero
+            for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
-                lookup[i] = anyCharacterToMintermId.Find(i);
+                // precompute all assigned minterm categories
+                (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                foreach ((uint start, uint end) in mintermRanges)
+                {
+                    // assign character ranges in bulk
+                    Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
+                    slice.Fill(mintermId);
+                }
             }
             _lookup = lookup;
         }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 06614dfd34f29..a6f86e09ffacf 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -173,43 +173,50 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
         }
 
         /// <summary>
-        /// Optimized reversal state computation which takes skips the fixed length parts
+        /// Optimized reversal state computation during construction which
+        /// skips the fixed length parts of reversal
+        /// e.g. for the pattern abc.*def
+        /// 1) the end is found at abc.*def|
+        /// 2) the reversal starts at abc.*|
         /// </summary>
-        /// <param name="node"></param>
-        /// <returns></returns>
+        /// <param name="node">reversed initial pattern</param>
+        /// <returns>returns n of chars to skip and adjusted reversal start state</returns>
         private (int, MatchingState<TSet>) CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
         {
-            var pos = 0;
-            var current = node;
-            var canLoop = true;
-            var incrPos = new Func<(int, SymbolicRegexNode<TSet>), (bool, SymbolicRegexNode<TSet>)>(value =>
+            int pos = 0;
+            SymbolicRegexNode<TSet>? current = node;
+            bool canLoop = true;
+            var addSingleton = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
-                pos += value.Item1;
-                return (true, value.Item2);
+                pos += 1;
+                // continue with next concat
+                return (true, concatNode._right!);
             });
-            var decrLoop = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(value =>
+            var addFixedLengthLoop = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
-                var concat = value;
-                var loop = concat._left;
-                switch (loop!._left!.Kind)
+                SymbolicRegexNode<TSet>? loopNode = concatNode._left;
+                if (loopNode is { _lower: <= 0 })
+                {
+                    return (false, concatNode);
+                }
+                switch (loopNode!._left!.Kind)
                 {
                     case SymbolicRegexNodeKind.Singleton:
-                        if (loop._lower == loop._upper)
-                        {
-                            pos += loop._lower;
-                            return (true, concat._right!);
-                        }
-                        if (loop._lower > 0)
+
+                        if (loopNode._lower == loopNode._upper)
                         {
-                            var delta = loop._upper - loop._lower;
-                            var newLeft = _builder.CreateLoop(loop._left, loop.IsLazy, 0, delta);
-                            var newNode = _builder.CreateConcat(newLeft, concat._right!);
-                            pos += loop._lower;
-                            return (true, newNode);
+                            pos += loopNode._lower;
+                            // the entire loop is fixed, continue
+                            return (true, concatNode._right!);
                         }
-                        return (false, concat);
+                        // subtract the fixed part of the loop
+                        int loopRemainder = loopNode._upper - loopNode._lower;
+                        SymbolicRegexNode<TSet> newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
+                        SymbolicRegexNode<TSet> newNode = _builder.CreateConcat(newLeft, concatNode._right!);
+                        pos += loopNode._lower;
+                        return (true, newNode);
                     default:
-                        return (false, concat);
+                        return (false, concatNode);
                 }
             });
             while (canLoop)
@@ -224,15 +231,15 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
                     {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } =>
                         (true, current._right!),
                     {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} =>
-                        incrPos((1, current._right!)),
+                        addSingleton(current),
                     {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
-                        decrLoop(current),
+                        addFixedLengthLoop(current),
                     _ => (false, current)
                 };
                 canLoop = loop;
                 current = next;
             }
-            return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0, false));
+            return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
         }
 
         /// <summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index c0118d52553ff..b00f2631c3aa2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -33,6 +33,7 @@ internal static class SymbolicRegexThresholds
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
         /// after the AST has been anlayzed for safe handling.
+        /// TODO: this is perhaps too conservative, consider raising this
         /// <remarks>
         /// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
         /// This default value may be overridden with the AppContext data
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
index 0e7046a04f36d..cbddba878edc2 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
@@ -253,5 +253,19 @@ public void SafeThresholdConfigTest(object? newThresholdData, int expectedThresh
             AppContext.SetData(SymbolicRegexThresholds.SymbolicRegexSafeSizeThreshold_ConfigKeyName, null);
             Assert.Equal(expectedThreshold, k);
         }
+
+        [Fact]
+        public static void OptimizedReversalTests()
+        {
+            var charSetSolver = new CharSetSolver();
+            var bddBuilder = new SymbolicRegexBuilder<BDD>(charSetSolver, charSetSolver);
+            var converter = new RegexNodeConverter(bddBuilder, null);
+            const RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture;
+            RegexNode tree = RegexParser.Parse("abc.*def", options, CultureInfo.CurrentCulture).Root;
+            SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
+            // todo: import the matcher here or use something else?
+            // var matcher = SymbolicRegexMatcher.Create(bddBuilder, rootNode, 0, null, TimeSpan.MaxValue);
+
+        }
     }
 }

From 5a2636c9248d9bf115056c3cca255083e688afad Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 28 May 2024 04:08:23 +0300
Subject: [PATCH 08/63] more dfa overhead removed

---
 .../Symbolic/BitVectorSolver.cs               |  4 +-
 .../Symbolic/MatchingState.cs                 | 17 ++++
 .../Symbolic/MintermClassifier.cs             | 34 +-------
 .../Symbolic/RegexNodeConverter.cs            | 84 +++++++++++++++++++
 .../Symbolic/SymbolicRegexMatcher.cs          | 48 ++++++-----
 .../Symbolic/SymbolicRegexRunnerFactory.cs    |  5 +-
 .../Symbolic/UInt64Solver.cs                  |  4 +-
 7 files changed, 140 insertions(+), 56 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
index 09db2948d717b..b30527871e2bb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
@@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver<BitVector>
         internal readonly MintermClassifier _classifier;
         private readonly BitVector[] _mintermVectors;
 
-        public BitVectorSolver(BDD[] minterms, CharSetSolver solver)
+        public BitVectorSolver(BDD[] minterms)
         {
             _minterms = minterms;
 
-            _classifier = new MintermClassifier(minterms, solver);
+            _classifier = new MintermClassifier(minterms);
 
             var singleBitVectors = new BitVector[minterms.Length];
             for (int i = 0; i < singleBitVectors.Length; i++)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index da7128b464da5..9624e0fd143bd 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -14,7 +14,18 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         {
             Node = node;
             PrevCharKind = prevCharKind;
+            // this is significantly cheaper to initialize once
+            // than to pay for it on every call
+            if (Node.CanBeNullable)
+            {
+                _nullabilityLookup = new bool[5];
+                for (uint nk = 0; nk <= 4; nk++)
+                {
+                    _nullabilityLookup[nk] = IsNullableForInit(nk);
+                }
+            }
         }
+        private readonly bool[]? _nullabilityLookup;
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
@@ -97,6 +108,12 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
+        {
+            return (_nullabilityLookup is not null && _nullabilityLookup[nextCharKind]);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal bool IsNullableForInit(uint nextCharKind)
         {
             Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
             return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind));
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 799814ee7b9c2..1132b3881efc4 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -20,51 +20,23 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
-        /// <summary>An array used when there's a single minterm, in order to map every ASCII character to it trivially.</summary>
-        // private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
+        /// <summary>An array used to map characters to minterms</summary>
         private readonly int[] _lookup;
 
-        // /// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
-        // /// <remarks>
-        // /// The use of a multi-terminal BDD here is an implementation detail.  Should we decide its important to optimize non-ASCII inputs further,
-        // /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed.
-        // /// </remarks>
-        // private readonly BDD _nonAscii;
-
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
-        /// <param name="solver">The character set solver to use.</param>
-        public MintermClassifier(BDD[] minterms, CharSetSolver solver)
+        public MintermClassifier(BDD[] minterms)
         {
             Debug.Assert(minterms.Length > 0, "Requires at least");
 
-            var lookup = new int[ushort.MaxValue + 1];
+            int[] lookup = new int[ushort.MaxValue + 1];
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
-                // For ASCII, use an array containing all zeros.  For non-ASCII, use a BDD that maps everything to 0.
                 _lookup = lookup;
-                // _nonAscii = solver.ReplaceTrue(BDD.True, 0);
                 return;
             }
 
-            // Create a multi-terminal BDD for mapping any character to its associated minterm.
-            BDD anyCharacterToMintermId = BDD.False;
-            for (int i = 0; i < minterms.Length; i++)
-            {
-                // Each supplied minterm BDD decides whether a given character maps to it or not.
-                // We need to combine all of those into a multi-terminal BDD that decides which
-                // minterm a character maps to.  To do that, we take each minterm BDD and replace
-                // its True result with the ID of the minterm, such that a character that would
-                // have returned True for that BDD now returns the minterm ID.
-                BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);
-
-                // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
-                // is valid because every character belongs to exactly one minterm and thus will
-                // only map to an ID instead of False in exactly one of the input BDDs.
-                anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
-            }
-
             // assign minterm category for every char
             // unused characters in minterm 0 get mapped to zero
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
index 9194ca00c971c..9e6c25f41a3d4 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
@@ -527,5 +527,89 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
                 }
             }
         }
+
+
+        /// <summary>
+        /// attempt to remove anchors when possible since it reduces overhead
+        /// more rewrites could be tried but it's important to preserve PCRE semantics
+        /// </summary>
+        /// <param name="builder"></param>
+        /// <param name="rootNode"></param>
+        /// <returns></returns>
+        internal static SymbolicRegexNode<BDD> ApplyRootRewrites(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> rootNode)
+        {
+            // only consider removing anchors, otherwise bail
+            if (!rootNode._info.ContainsSomeAnchor) return rootNode;
+
+            // Func<string, bool> _wout = st =>
+            // {
+            //     var a_cons = System.Reflection.Assembly.Load("System.Console");
+            //     var t_cons = a_cons.GetType("System.Console")!;
+            //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
+            //     wl!.Invoke(null, [st]);
+            //     return true;
+            // };
+
+            SymbolicRegexNode<BDD> ApplyRewrites(SymbolicRegexNode<BDD> node)
+            {
+                // Guard against stack overflow due to deep recursion
+                if (!StackHelper.TryEnsureSufficientExecutionStack())
+                {
+                    return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node));
+                }
+
+                var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver);
+
+                switch (node._kind)
+                {
+                    case SymbolicRegexNodeKind.Concat:
+                        // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}");
+                        switch (node._left!._kind)
+                        {
+                            case SymbolicRegexNodeKind.CaptureStart:
+                                return builder.CreateConcat(node._left, ApplyRewrites(node._right!));
+                            case SymbolicRegexNodeKind.BoundaryAnchor:
+                                return node._right! switch
+                                {
+                                    // \b\w{1,}.. -> \w{1,}
+                                    // anchor to the left can be removed
+                                    {
+                                        _kind: SymbolicRegexNodeKind.Concat, _left:
+                                        {
+                                            _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue
+
+                                        } wordLoop
+                                    }
+                                     when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!),
+                                    _ => node
+                                };
+                            case SymbolicRegexNodeKind.Loop:
+                                var loopnode = node._left!;
+                                // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case
+                                bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1;
+                                bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl);
+                                return node._right! switch
+                                {
+                                    // anchor to the right can be removed
+                                    {
+                                        _kind: SymbolicRegexNodeKind.Concat,
+                                        _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor,
+                                        _right._kind: SymbolicRegexNodeKind.CaptureEnd
+                                    } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)),
+                                    _ => node
+                                };
+                        }
+                        return node;
+
+
+                    default:
+                        return node;
+                }
+            }
+
+            SymbolicRegexNode<BDD> rewritten = ApplyRewrites(rootNode);
+            // _wout(rewritten.ToString());
+            return rewritten;
+        }
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 99d34fba06ace..d2e333b45f0f8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -205,6 +205,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             {
                 var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
                 {
+                    // _wout($"s{fds.Set}");
+                    // _wout($"c{fds.Chars.AsSpan()}");
                     return fds switch
                     {
                         // anything above 4 uint16 chars is generally slower than DFA
@@ -225,6 +227,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                         findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations,
                     _ => findOptimizations // TODO: unsure which options are left here
                 };
+                // _wout($"{findOptimizations.FindMode}");
+                // _wout($"o{_findOpts}");
             }
 
             // Determine the number of initial states. If there's no anchor, only the default previous
@@ -488,14 +492,29 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                 // still check the timeout now and again to provide some semblance of the behavior a developer experiences with
                 // the backtracking engines.  We can, however, choose a large number here, since it's not actually needed for security.
                 const int CharsPerTimeoutCheck = 1_000;
+                // TODO: maybe this should be for NFA mode only
                 int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
                     pos + CharsPerTimeoutCheck :
                     input.Length;
 
+                if (pos == input.Length && currentState.NfaState is null)
+                {
+                    if ((!_stateFlagsArray[currentState.DfaStateId].IsNullable() &&
+                         !_stateArray[currentState.DfaStateId]!.IsNullableFor(
+                             GetPositionKind(-1))))
+                    {
+                        break;
+                    }
+                    // the end position (-1) was nullable
+                    endPos = pos;
+                    endStateId = currentState.DfaStateId;
+                    break;
+                }
+
                 bool done = currentState.NfaState is not null ?
                     FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
                     // If there are no edge cases then use the quicker loop
-                    (_findOpts is null && !_containsEndZAnchor && pos < input.Length - 1) ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
+                    _findOpts is null && !_containsEndZAnchor ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
                     FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -526,6 +545,8 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
 
             // Check whether there's a fixed-length marker for the current state.  If there is, we can
             // use that length to optimize subsequent matching phases.
+            // TODO: profiling shows around 4% gets lost here with high-match count,
+            // if not for the endZ anchor this could be cached with minterm lookup
             matchLength = endStateId > 0 ? GetState(endStateId).FixedLength(GetCharKind<TInputReader>(input, endPos)) : -1;
             return endPos;
         }
@@ -536,18 +557,15 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
         /// i don't trust the compiler to optimize this and it makes a
         /// ~50% difference in performance with removing unnecessary checks alone
         /// </summary>
-        private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+        private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
                 ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
         {
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            int final = length - 1;
             Span<int> mtlookup = _mintermClassifier.Lookup.AsSpan();
             int endStateId = endStateIdRef;
             int currStateId = startStateId;
-            int initialStatePos = initialStatePosRef;
-            int initialStatePosCandidate = initialStatePosCandidateRef;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
@@ -557,17 +575,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     {
                         return true;
                     }
-                    int positionId = mtlookup[input[pos]];
+                    // int positionId = mtlookup[input[pos]];
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
                     if (_canBeNullableArray[currStateId])
                     {
-                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
+                        if (_stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
                         {
                             endPos = pos;
                             endStateId = currStateId;
-                            initialStatePos = initialStatePosCandidate;
                             // A match is known to exist.  If that's all we need to know, we're done.
                             if (mode == RegexRunnerMode.ExistenceRequired)
                             {
@@ -578,18 +595,13 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
 
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= final)
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]]) || pos >= lengthMinus1)
                     {
-                        // _wout($"end1: {_stateArray[currStateId]}");
-                        if (pos < final)
+                        pos++;
+                        if (pos < input.Length)
                         {
                             return false;
                         }
-                        pos++;
-                        // _wout($"end: {_stateArray[currStateId]}");
-                        // final transition
-                        // DfaStateHandler.TryTakeDFATransition(this, ref currStateId, -1);
-                        //
                         // one off check for the final position
                         // this is just to move it out of the hot loop
                         if ((!_stateFlagsArray[currStateId].IsNullable() &&
@@ -601,7 +613,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                         // the end position (-1) was nullable
                         endPos = pos;
                         endStateId = currStateId;
-                        initialStatePos = initialStatePosCandidate;
                         return mode == RegexRunnerMode.ExistenceRequired;
                     }
 
@@ -615,8 +626,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                 posRef = pos;
                 endPosRef = endPos;
                 endStateIdRef = endStateId;
-                initialStatePosRef = initialStatePos;
-                initialStatePosCandidateRef = initialStatePosCandidate;
+                initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
             }
         }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
index fea9518b79b51..ecd746ed6de87 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
@@ -21,6 +21,7 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
             var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping);
 
             SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root);
+            rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode);
 
             // Determine if the root node is supported for safe handling
             int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold();
@@ -40,8 +41,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
             BDD[] minterms = rootNode.ComputeMinterms(bddBuilder);
 
             _matcher = minterms.Length > 64 ?
-                SymbolicRegexMatcher<BitVector>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) :
-                SymbolicRegexMatcher<ulong>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout);
+                SymbolicRegexMatcher<BitVector>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms), matchTimeout) :
+                SymbolicRegexMatcher<ulong>.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms), matchTimeout);
         }
 
         /// <summary>Creates a <see cref="RegexRunner"/> object.</summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
index 7664d6d03aa4a..c65c00fd23413 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
@@ -12,12 +12,12 @@ internal sealed class UInt64Solver : ISolver<ulong>
         private readonly BDD[] _minterms;
         internal readonly MintermClassifier _classifier;
 
-        public UInt64Solver(BDD[] minterms, CharSetSolver solver)
+        public UInt64Solver(BDD[] minterms)
         {
             Debug.Assert(minterms.Length <= 64);
 
             _minterms = minterms;
-            _classifier = new MintermClassifier(minterms, solver);
+            _classifier = new MintermClassifier(minterms);
 
             Full = minterms.Length == 64 ? ulong.MaxValue : ulong.MaxValue >> (64 - minterms.Length);
         }

From 57e5b8d80c45ffc52b8d04351e6ee256d9081037 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 28 May 2024 05:00:27 +0300
Subject: [PATCH 09/63] removed potential rewrite

---
 .../Symbolic/RegexNodeConverter.cs            | 165 +++++++++---------
 .../Symbolic/SymbolicRegexMatcher.cs          |   2 +-
 .../Symbolic/SymbolicRegexRunnerFactory.cs    |   2 +-
 3 files changed, 85 insertions(+), 84 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
index 9e6c25f41a3d4..88fc386b6956e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
@@ -529,87 +529,88 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
         }
 
 
-        /// <summary>
-        /// attempt to remove anchors when possible since it reduces overhead
-        /// more rewrites could be tried but it's important to preserve PCRE semantics
-        /// </summary>
-        /// <param name="builder"></param>
-        /// <param name="rootNode"></param>
-        /// <returns></returns>
-        internal static SymbolicRegexNode<BDD> ApplyRootRewrites(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> rootNode)
-        {
-            // only consider removing anchors, otherwise bail
-            if (!rootNode._info.ContainsSomeAnchor) return rootNode;
-
-            // Func<string, bool> _wout = st =>
-            // {
-            //     var a_cons = System.Reflection.Assembly.Load("System.Console");
-            //     var t_cons = a_cons.GetType("System.Console")!;
-            //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
-            //     wl!.Invoke(null, [st]);
-            //     return true;
-            // };
-
-            SymbolicRegexNode<BDD> ApplyRewrites(SymbolicRegexNode<BDD> node)
-            {
-                // Guard against stack overflow due to deep recursion
-                if (!StackHelper.TryEnsureSufficientExecutionStack())
-                {
-                    return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node));
-                }
-
-                var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver);
-
-                switch (node._kind)
-                {
-                    case SymbolicRegexNodeKind.Concat:
-                        // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}");
-                        switch (node._left!._kind)
-                        {
-                            case SymbolicRegexNodeKind.CaptureStart:
-                                return builder.CreateConcat(node._left, ApplyRewrites(node._right!));
-                            case SymbolicRegexNodeKind.BoundaryAnchor:
-                                return node._right! switch
-                                {
-                                    // \b\w{1,}.. -> \w{1,}
-                                    // anchor to the left can be removed
-                                    {
-                                        _kind: SymbolicRegexNodeKind.Concat, _left:
-                                        {
-                                            _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue
-
-                                        } wordLoop
-                                    }
-                                     when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!),
-                                    _ => node
-                                };
-                            case SymbolicRegexNodeKind.Loop:
-                                var loopnode = node._left!;
-                                // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case
-                                bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1;
-                                bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl);
-                                return node._right! switch
-                                {
-                                    // anchor to the right can be removed
-                                    {
-                                        _kind: SymbolicRegexNodeKind.Concat,
-                                        _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor,
-                                        _right._kind: SymbolicRegexNodeKind.CaptureEnd
-                                    } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)),
-                                    _ => node
-                                };
-                        }
-                        return node;
-
-
-                    default:
-                        return node;
-                }
-            }
-
-            SymbolicRegexNode<BDD> rewritten = ApplyRewrites(rootNode);
-            // _wout(rewritten.ToString());
-            return rewritten;
-        }
+        // /// <summary>
+        // /// attempt to remove anchors when possible since it reduces overhead
+        // /// more rewrites could be tried but it's important to preserve PCRE semantics
+        // /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner
+        // /// </summary>
+        // /// <param name="builder"></param>
+        // /// <param name="rootNode"></param>
+        // /// <returns></returns>
+    //     internal static SymbolicRegexNode<BDD> ApplyRootRewrites(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> rootNode)
+    //     {
+    //         // only consider removing anchors, otherwise bail
+    //         if (!rootNode._info.ContainsSomeAnchor) return rootNode;
+
+    //         // Func<string, bool> _wout = st =>
+    //         // {
+    //         //     var a_cons = System.Reflection.Assembly.Load("System.Console");
+    //         //     var t_cons = a_cons.GetType("System.Console")!;
+    //         //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
+    //         //     wl!.Invoke(null, [st]);
+    //         //     return true;
+    //         // };
+
+    //         SymbolicRegexNode<BDD> ApplyRewrites(SymbolicRegexNode<BDD> node)
+    //         {
+    //             // Guard against stack overflow due to deep recursion
+    //             if (!StackHelper.TryEnsureSufficientExecutionStack())
+    //             {
+    //                 return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node));
+    //             }
+
+    //             var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver);
+
+    //             switch (node._kind)
+    //             {
+    //                 case SymbolicRegexNodeKind.Concat:
+    //                     // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}");
+    //                     switch (node._left!._kind)
+    //                     {
+    //                         case SymbolicRegexNodeKind.CaptureStart:
+    //                             return builder.CreateConcat(node._left, ApplyRewrites(node._right!));
+    //                         case SymbolicRegexNodeKind.BoundaryAnchor:
+    //                             return node._right! switch
+    //                             {
+    //                                 // \b\w{1,}.. -> \w{1,}
+    //                                 // anchor to the left can be removed
+    //                                 {
+    //                                     _kind: SymbolicRegexNodeKind.Concat, _left:
+    //                                     {
+    //                                         _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue
+
+    //                                     } wordLoop
+    //                                 }
+    //                                  when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!),
+    //                                 _ => node
+    //                             };
+    //                         case SymbolicRegexNodeKind.Loop:
+    //                             var loopnode = node._left!;
+    //                             // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case
+    //                             bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1;
+    //                             bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl);
+    //                             return node._right! switch
+    //                             {
+    //                                 // anchor to the right can be removed
+    //                                 {
+    //                                     _kind: SymbolicRegexNodeKind.Concat,
+    //                                     _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor,
+    //                                     _right._kind: SymbolicRegexNodeKind.CaptureEnd
+    //                                 } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)),
+    //                                 _ => node
+    //                             };
+    //                     }
+    //                     return node;
+
+
+    //                 default:
+    //                     return node;
+    //             }
+    //         }
+
+    //         SymbolicRegexNode<BDD> rewritten = ApplyRewrites(rootNode);
+    //         // _wout(rewritten.ToString());
+    //         return rewritten;
+    //     }
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index d2e333b45f0f8..303fa200e1819 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -581,7 +581,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     // we found a potential end state.
                     if (_canBeNullableArray[currStateId])
                     {
-                        if (_stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
+                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
                         {
                             endPos = pos;
                             endStateId = currStateId;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
index ecd746ed6de87..c046531f8a295 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
@@ -21,7 +21,7 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
             var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping);
 
             SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root);
-            rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode);
+            // rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode);
 
             // Determine if the root node is supported for safe handling
             int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold();

From 4d275dbf512ac2f83630d443458da9f1936a0153 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 28 May 2024 16:12:20 +0300
Subject: [PATCH 10/63] low memory variant

---
 .../Symbolic/MintermClassifier.cs             | 30 +++++++++++++++++--
 .../Symbolic/SymbolicRegexMatcher.cs          |  8 +++--
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 1132b3881efc4..eceef93abfe1b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -20,8 +20,10 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
+        private static readonly int[] s_emptyLookup = new int[ushort.MaxValue + 1];
         /// <summary>An array used to map characters to minterms</summary>
         private readonly int[] _lookup;
+        private readonly bool _isAsciiOnly;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
@@ -29,16 +31,30 @@ public MintermClassifier(BDD[] minterms)
         {
             Debug.Assert(minterms.Length > 0, "Requires at least");
 
-            int[] lookup = new int[ushort.MaxValue + 1];
+
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
-                _lookup = lookup;
+                _lookup = s_emptyLookup;
                 return;
             }
 
+            // low memory variant could create an ascii-only array
+            // cheaper to iterate twice than allocate an array and potentially not use it
+            _isAsciiOnly = true;
+            for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+            {
+                (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                if (mintermRanges[^1].Item2 >= 128)
+                {
+                    _isAsciiOnly = false;
+                }
+            }
+
+
             // assign minterm category for every char
             // unused characters in minterm 0 get mapped to zero
+            int[] lookup = new int[_isAsciiOnly ? 128 : 65536];
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
                 // precompute all assigned minterm categories
@@ -57,8 +73,16 @@ public MintermClassifier(BDD[] minterms)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
+            if (_isAsciiOnly && (c >= 128))
+            {
+                return 0;
+            }
+            // high performance variant would use a span directly
+            // but this is not possible in low memory constraints
             return _lookup[c];
         }
-        public int[] Lookup => _lookup;
+
+        // [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        // public Span<int> LookupSpan() => _lookup.AsSpan();
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 303fa200e1819..e08bd315819f1 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -563,7 +563,8 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            Span<int> mtlookup = _mintermClassifier.Lookup.AsSpan();
+            // can only be used with full array
+            // Span<int> mtlookup = _mintermClassifier.Lookup.AsSpan();
             int endStateId = endStateIdRef;
             int currStateId = startStateId;
             try
@@ -576,12 +577,13 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                         return true;
                     }
                     // int positionId = mtlookup[input[pos]];
+                    int positionId = _mintermClassifier.GetMintermID(input[pos]);
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
                     if (_canBeNullableArray[currStateId])
                     {
-                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
+                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
                         {
                             endPos = pos;
                             endStateId = currStateId;
@@ -595,7 +597,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
 
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]]) || pos >= lengthMinus1)
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= lengthMinus1)
                     {
                         pos++;
                         if (pos < input.Length)

From c35ed7e9be5756f4d02aa29b8df0ef137c5b10f0 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 28 May 2024 21:12:46 +0300
Subject: [PATCH 11/63] some kind of compromise between speed and memory

---
 .../Symbolic/MintermClassifier.cs             |  73 ++++++++---
 .../Symbolic/SymbolicRegexMatcher.cs          | 121 ++++++++++++++++--
 2 files changed, 164 insertions(+), 30 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index eceef93abfe1b..a2ea8dcaeb904 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -20,11 +20,19 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
-        private static readonly int[] s_emptyLookup = new int[ushort.MaxValue + 1];
+        private static readonly byte[] s_emptyLookup = new byte[ushort.MaxValue + 1];
         /// <summary>An array used to map characters to minterms</summary>
-        private readonly int[] _lookup;
+        private readonly byte[]? _lookup;
+
+        /// <summary>Conserve memory if pattern is ascii-only</summary>
         private readonly bool _isAsciiOnly;
 
+        /// <summary>
+        /// fallback lookup if over 255 minterms
+        /// this is almost never used
+        /// </summary>
+        private readonly int[]? _intLookup;
+
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
         public MintermClassifier(BDD[] minterms)
@@ -39,8 +47,9 @@ public MintermClassifier(BDD[] minterms)
                 return;
             }
 
-            // low memory variant could create an ascii-only array
-            // cheaper to iterate twice than allocate an array and potentially not use it
+            // low memory variant is to create an ascii-only array
+            // this adds indirection to the hot loop which costs performance
+            // and only exists because the wasm tests fail with OOM
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
@@ -51,25 +60,44 @@ public MintermClassifier(BDD[] minterms)
                 }
             }
 
-
             // assign minterm category for every char
             // unused characters in minterm 0 get mapped to zero
-            int[] lookup = new int[_isAsciiOnly ? 128 : 65536];
-            for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+            if (minterms.Length > 255)
             {
-                // precompute all assigned minterm categories
-                (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                foreach ((uint start, uint end) in mintermRanges)
+                // over 255 unique sets also means it's never ascii only
+                int[] lookup = new int[ushort.MaxValue + 1];
+                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
-                    // assign character ranges in bulk
-                    Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                    slice.Fill(mintermId);
+                    // precompute all assigned minterm categories
+                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                    foreach ((uint start, uint end) in mintermRanges)
+                    {
+                        // assign character ranges in bulk
+                        Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
+                        slice.Fill(mintermId);
+                    }
                 }
+                _intLookup = lookup;
+            }
+            else
+            {
+                byte[] lookup = new byte[_isAsciiOnly ? 128 : ushort.MaxValue + 1];
+                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+                {
+                    // precompute all assigned minterm categories
+                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                    foreach ((uint start, uint end) in mintermRanges)
+                    {
+                        // assign character ranges in bulk
+                        Span<byte> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
+                        slice.Fill((byte)mintermId);
+                    }
+                }
+                _lookup = lookup;
             }
-            _lookup = lookup;
         }
 
-        /// <summary>Gets the ID of the minterm associated with the specified character.</summary>
+        // /// <summary>Gets the ID of the minterm associated with the specified character.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
@@ -79,10 +107,19 @@ public int GetMintermID(int c)
             }
             // high performance variant would use a span directly
             // but this is not possible in low memory constraints
-            return _lookup[c];
+            // additional memory is saved by using a byte
+            return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
 
-        // [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        // public Span<int> LookupSpan() => _lookup.AsSpan();
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public bool IsAsciiOnly() => _isAsciiOnly;
+
+        /// <summary>
+        /// Can be null if there is over 255 minterms
+        /// </summary>
+        /// <returns></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public byte[]? ByteLookup() => _lookup;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index e08bd315819f1..298012032e9b5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -511,11 +511,31 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                     break;
                 }
 
-                bool done = currentState.NfaState is not null ?
-                    FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    // If there are no edge cases then use the quicker loop
-                    _findOpts is null && !_containsEndZAnchor ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
-                    FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+                bool done;
+                if (currentState.NfaState is not null)
+                    // nfa fallback check
+                    done = FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
+                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
+                            ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+                else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null)
+                {
+                    done = _mintermClassifier.IsAsciiOnly()
+                        ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1,
+                            mode, ref pos,
+                            currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
+                            ref initialStatePosCandidate)
+                        // if there are no edge cases then use the quicker loop
+                        : FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos,
+                        currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
+                        ref initialStatePosCandidate);
+                }
+                else
+                {
+                    // dfa loop with potential skipping
+                    done = FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
+                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
+                            ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+                }
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
@@ -551,6 +571,85 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
             return endPos;
         }
 
+        /// <summary>
+        /// Ascii-only variant of the hot loop to conserve memory
+        /// </summary>
+        private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
+                ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+        {
+            // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
+            int pos = posRef;
+            int endPos = endPosRef;
+            // can only be used with full array initialized and <= 255 minterms
+            byte[] mtlookup = _mintermClassifier.ByteLookup()!;
+            int endStateId = endStateIdRef;
+            int currStateId = startStateId;
+            try
+            {
+                // Loop through each character in the input, transitioning from state to state for each.
+                while (true)
+                {
+                    if (currStateId == _deadStateId)
+                    {
+                        return true;
+                    }
+
+                    int c = input[pos];
+                    int positionId = c >= 128 ? 0 : mtlookup[c];
+
+                    // If the state is nullable for the next character, meaning it accepts the empty string,
+                    // we found a potential end state.
+                    if (_canBeNullableArray[currStateId])
+                    {
+                        if (_stateFlagsArray[currStateId].IsNullable()
+                            || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
+                        {
+                            endPos = pos;
+                            endStateId = currStateId;
+                            // A match is known to exist.  If that's all we need to know, we're done.
+                            if (mode == RegexRunnerMode.ExistenceRequired)
+                            {
+                                return true;
+                            }
+                        }
+                    }
+
+                    // If there is more input available try to transition with the next character.
+                    // Note: the order here is important so the transition gets taken
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1)
+                    {
+                        pos++;
+                        if (pos < input.Length)
+                        {
+                            return false;
+                        }
+                        // one off check for the final position
+                        // this is just to move it out of the hot loop
+                        if ((!_stateFlagsArray[currStateId].IsNullable() &&
+                             !_stateArray[currStateId]!.IsNullableFor(
+                                 GetPositionKind(-1))))
+                        {
+                            return false;
+                        }
+                        // the end position (-1) was nullable
+                        endPos = pos;
+                        endStateId = currStateId;
+                        return mode == RegexRunnerMode.ExistenceRequired;
+                    }
+
+                    // We successfully transitioned, so update our current input index to match.
+                    pos++;
+                }
+            }
+            finally
+            {
+                // Write back the local copies of the ref values.
+                posRef = pos;
+                endPosRef = endPos;
+                endStateIdRef = endStateId;
+                initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
+            }
+        }
 
         /// <summary>
         /// TODO: this is essentially a stripped down version when there's no good prefix optimizations
@@ -563,8 +662,8 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            // can only be used with full array
-            // Span<int> mtlookup = _mintermClassifier.Lookup.AsSpan();
+            // can only be used with full array initialized and <= 255 minterms
+            byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int endStateId = endStateIdRef;
             int currStateId = startStateId;
             try
@@ -576,14 +675,12 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     {
                         return true;
                     }
-                    // int positionId = mtlookup[input[pos]];
-                    int positionId = _mintermClassifier.GetMintermID(input[pos]);
-
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
                     if (_canBeNullableArray[currStateId])
                     {
-                        if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
+                        if (_stateFlagsArray[currStateId].IsNullable()
+                            || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
                         {
                             endPos = pos;
                             endStateId = currStateId;
@@ -597,7 +694,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
 
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= lengthMinus1)
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1)
                     {
                         pos++;
                         if (pos < input.Length)

From 868e02d955f200d892f2360c99be1cc700471719 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 29 May 2024 19:43:20 +0300
Subject: [PATCH 12/63] cheaper nullability checks

---
 .../Symbolic/MatchingState.cs                 | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 9624e0fd143bd..4094dfec19ccb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -18,14 +18,23 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
             // than to pay for it on every call
             if (Node.CanBeNullable)
             {
-                _nullabilityLookup = new bool[5];
-                for (uint nk = 0; nk <= 4; nk++)
+                for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
                 {
-                    _nullabilityLookup[nk] = IsNullableForInit(nk);
+                    _nullabilityLookup |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0);
                 }
             }
         }
-        private readonly bool[]? _nullabilityLookup;
+        /// <summary>
+        /// todo: change this to flags later
+        /// nullability for each context encoded in a bit
+        /// 0 means node cannot be nullable
+        /// 00001 -> nullable for General
+        /// 00010 -> nullable for BeginningEnd
+        /// 00100 -> nullable for NewLine
+        /// 01000 -> nullable for NewLineS
+        /// 10000 -> nullable for WordLetter
+        /// </summary>
+        private readonly byte _nullabilityLookup; // redundant but added for clarity
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
@@ -106,12 +115,18 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
             return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
         }
 
+        /// <summary>
+        /// Bit encoded nullability check for the hot loop
+        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
         {
-            return (_nullabilityLookup is not null && _nullabilityLookup[nextCharKind]);
+            return (nextCharKind & _nullabilityLookup) > 0;
         }
 
+        /// <summary>
+        /// Full nullability check for initialization
+        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableForInit(uint nextCharKind)
         {

From 14afd188bab372ccf2ba9f219213962806642206 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 29 May 2024 19:51:22 +0300
Subject: [PATCH 13/63] nullability encoding

---
 .../Text/RegularExpressions/Symbolic/MatchingState.cs      | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 4094dfec19ccb..13c8f3900bb02 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -26,6 +26,9 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         }
         /// <summary>
         /// todo: change this to flags later
+        /// i think the clr assigns an entire class field for this
+        /// so this should be placed in an array as well
+        /// --
         /// nullability for each context encoded in a bit
         /// 0 means node cannot be nullable
         /// 00001 -> nullable for General
@@ -34,7 +37,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         /// 01000 -> nullable for NewLineS
         /// 10000 -> nullable for WordLetter
         /// </summary>
-        private readonly byte _nullabilityLookup; // redundant but added for clarity
+        private readonly byte _nullabilityLookup;
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
@@ -121,7 +124,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
         {
-            return (nextCharKind & _nullabilityLookup) > 0;
+            return ((nextCharKind + 1) & _nullabilityLookup) > 0;
         }
 
         /// <summary>

From 5f5ab5523d880086382eeb0edc5e7d42399f8fa1 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 29 May 2024 21:44:17 +0300
Subject: [PATCH 14/63] nullability cached as bytes

---
 .../src/System.Text.RegularExpressions.csproj |  2 +-
 .../Symbolic/MatchingState.cs                 | 62 ++++++++++++-------
 .../Symbolic/MintermClassifier.cs             | 17 ++---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 24 ++++++-
 .../Symbolic/SymbolicRegexMatcher.cs          | 52 ++++++++--------
 5 files changed, 97 insertions(+), 60 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index a6f7119d2fd2f..6fbc17722a774 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,7 +5,7 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060</NoWarn>-->
+<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649</NoWarn>-->
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 13c8f3900bb02..941a5f76ea27e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -14,30 +14,17 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         {
             Node = node;
             PrevCharKind = prevCharKind;
-            // this is significantly cheaper to initialize once
-            // than to pay for it on every call
-            if (Node.CanBeNullable)
-            {
-                for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
-                {
-                    _nullabilityLookup |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0);
-                }
-            }
+            NullabilityInfo = BuildNullabilityInfo();
         }
+
         /// <summary>
-        /// todo: change this to flags later
-        /// i think the clr assigns an entire class field for this
-        /// so this should be placed in an array as well
-        /// --
-        /// nullability for each context encoded in a bit
-        /// 0 means node cannot be nullable
-        /// 00001 -> nullable for General
-        /// 00010 -> nullable for BeginningEnd
-        /// 00100 -> nullable for NewLine
-        /// 01000 -> nullable for NewLineS
-        /// 10000 -> nullable for WordLetter
+        /// TODO: The CLR assigns an entire field for this byte which is a waste,
+        /// and the much more preferred way to use this is in _nullabilityArray in the matcher
+        /// but the current design relies on interfaces/flags and
+        /// using the MatchingState directly so this byte is a quick solution to cheapen
+        /// it there by ~30% as well without having to breaking it all to pieces
         /// </summary>
-        private readonly byte _nullabilityLookup;
+        internal readonly int NullabilityInfo;
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
@@ -119,12 +106,15 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         }
 
         /// <summary>
-        /// Bit encoded nullability check for the hot loop
+        /// TODO: This method should really never be used and
+        /// is only used to speed up the existing architecture.
+        /// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
+        /// whereever possible
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
         {
-            return ((nextCharKind + 1) & _nullabilityLookup) > 0;
+            return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
         }
 
         /// <summary>
@@ -168,6 +158,32 @@ internal StateFlags BuildStateFlags(bool isInitial)
             return info;
         }
 
+        /// <summary>
+        /// nullability for each context is encoded in a bit
+        /// 0 means node cannot be nullable
+        /// 00001 -> nullable for General
+        /// 00010 -> nullable for BeginningEnd
+        /// 00100 -> nullable for NewLine
+        /// 01000 -> nullable for NewLineS
+        /// 10000 -> nullable for WordLetter
+        /// todo: change to flags later
+        /// </summary>
+        /// <returns></returns>
+        internal byte BuildNullabilityInfo()
+        {
+            byte nullabilityInfo = 0;
+            // this is significantly cheaper to initialize once
+            // than to pay for it on every call
+            if (Node.CanBeNullable)
+            {
+                for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
+                {
+                    nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0);
+                }
+            }
+            return nullabilityInfo;
+        }
+
         public override bool Equals(object? obj) =>
             obj is MatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index a2ea8dcaeb904..1f852d16cee2c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -47,8 +47,8 @@ public MintermClassifier(BDD[] minterms)
                 return;
             }
 
-            // low memory variant is to create an ascii-only array
-            // this adds indirection to the hot loop which costs performance
+            // low memory compromise is to create an ascii-only array
+            // int mintermId = c >= 128 ? 0 : mtlookup[c];
             // and only exists because the wasm tests fail with OOM
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
@@ -97,7 +97,7 @@ public MintermClassifier(BDD[] minterms)
             }
         }
 
-        // /// <summary>Gets the ID of the minterm associated with the specified character.</summary>
+        /// <summary>Gets the ID of the minterm associated with the specified character. </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
@@ -105,18 +105,21 @@ public int GetMintermID(int c)
             {
                 return 0;
             }
-            // high performance variant would use a span directly
-            // but this is not possible in low memory constraints
+            // high performance variant would use a span directly.
             // additional memory is saved by using a byte
             return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
 
-
+        /// <summary>
+        /// Whether to use the low memory ascii-only hot loop or the full loop
+        /// </summary>
+        /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public bool IsAsciiOnly() => _isAsciiOnly;
 
         /// <summary>
-        /// Can be null if there is over 255 minterms
+        /// Quick mapping from char to minterm,
+        /// can be null if there is over 255 minterms
         /// </summary>
         /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index a6f86e09ffacf..60e8712298bfb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -42,9 +42,17 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         private StateFlags[] _stateFlagsArray;
 
         /// <summary>
+        /// important: the pattern must not contain endZ for this to be valid.
         /// Used to short-circuit nullability in the hot loop
+        /// nullability for each context is encoded in a bit
+        /// 0 means node cannot be nullable
+        /// 00001 -> nullable for General
+        /// 00010 -> nullable for BeginningEnd
+        /// 00100 -> nullable for NewLine
+        /// 01000 -> nullable for NewLineS
+        /// 10000 -> nullable for WordLetter
         /// </summary>
-        private bool[] _canBeNullableArray;
+        private byte[] _nullabilityArray;
 
         /// <summary>
         /// Used to short-circuit accelerated states in the hot loop
@@ -127,6 +135,16 @@ private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
 
         private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId;
 
+        /// <summary>
+        /// Pre-computed hot-loop version of nullability check
+        /// </summary>
+        /// <param name="stateId"></param>
+        /// <param name="mintermId"></param>
+        /// <returns></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool IsNullableWithContext(int stateId, int mintermId) =>
+            ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0;
+
         /// <summary>Returns the span from <see cref="_dfaDelta"/> that may contain transitions for the given state</summary>
         private Span<int> GetDeltasFor(MatchingState<TSet> state)
         {
@@ -268,12 +286,12 @@ private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node
                     ArrayResizeAndVolatilePublish(ref _stateArray, newsize);
                     ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog);
                     ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize);
-                    ArrayResizeAndVolatilePublish(ref _canBeNullableArray, newsize);
+                    ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize);
                     ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize);
                 }
                 _stateArray[state.Id] = state;
                 _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
-                _canBeNullableArray[state.Id] = _stateFlagsArray[state.Id].CanBeNullable();
+                _nullabilityArray[state.Id] = state.BuildNullabilityInfo();
                 _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated();
             }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 298012032e9b5..343256513bdcd 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -81,13 +81,13 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
         private readonly RegexFindOptimizations? _findOpts;
 
-        /// <summary>TODO: summarize</summary>
+        /// <summary>Dead end state to quickly return NoMatch</summary>
         private readonly int _deadStateId;
 
-        /// <summary>TODO: summarize</summary>
+        /// <summary>Whether the pattern contains any anchor</summary>
         private readonly bool _containsAnyAnchor;
 
-        /// <summary>TODO: summarize</summary>
+        /// <summary>Whether the pattern contains the EndZ anchor which makes most optimizations invalid</summary>
         private readonly bool _containsEndZAnchor;
 
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
@@ -184,7 +184,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             // Initialization for fields in SymbolicRegexMatcher.Automata.cs
             _stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
             _stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
-            _canBeNullableArray = new bool[InitialDfaStateCapacity];
+            _nullabilityArray = new byte[InitialDfaStateCapacity];
             _canBeAcceleratedArray = new bool[InitialDfaStateCapacity];
             _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
 
@@ -519,13 +519,15 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                             ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
                 else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null)
                 {
-                    done = _mintermClassifier.IsAsciiOnly()
+                    done =
+                        _mintermClassifier.IsAsciiOnly()
                         ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1,
                             mode, ref pos,
                             currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
                             ref initialStatePosCandidate)
                         // if there are no edge cases then use the quicker loop
-                        : FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos,
+                        :
+                        FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos,
                         currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
                         ref initialStatePosCandidate);
                 }
@@ -572,7 +574,9 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
         }
 
         /// <summary>
-        /// Ascii-only variant of the hot loop to conserve memory
+        /// Ascii-only variant of the hot loop to conserve memory.
+        /// Only major difference is the minterm lookup:
+        /// `int positionId = c >= 128 ? 0 : mtlookup[c]`;
         /// </summary>
         private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
                 ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
@@ -597,12 +601,10 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                     int c = input[pos];
                     int positionId = c >= 128 ? 0 : mtlookup[c];
 
-                    // If the state is nullable for the next character, meaning it accepts the empty string,
-                    // we found a potential end state.
-                    if (_canBeNullableArray[currStateId])
+                    // If the state is nullable for the next character we found a potential end state.
+                    // note: the double array lookup is important here, storing a local variable is expensive
+                    if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, positionId))
                     {
-                        if (_stateFlagsArray[currStateId].IsNullable()
-                            || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId)))
                         {
                             endPos = pos;
                             endStateId = currStateId;
@@ -675,12 +677,10 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     {
                         return true;
                     }
-                    // If the state is nullable for the next character, meaning it accepts the empty string,
-                    // we found a potential end state.
-                    if (_canBeNullableArray[currStateId])
+                    // If the state is nullable for the next character, we found a potential end state.
+                    // note: the double array lookup is important here, storing a local variable is expensive
+                    if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]]))
                     {
-                        if (_stateFlagsArray[currStateId].IsNullable()
-                            || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]])))
                         {
                             endPos = pos;
                             endStateId = currStateId;
@@ -770,10 +770,6 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     {
                         return true;
                     }
-
-                    // Check if currentState represents an initial state. If it does, call into any possible find optimizations
-                    // to hopefully more quickly find the next possible starting location.
-                    // if (flags.IsAccelerated())
                     if (_canBeAcceleratedArray[state.DfaStateId])
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
@@ -787,7 +783,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
                         endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
@@ -859,6 +855,9 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
                 {
                     StateFlags flags = TStateHandler.GetStateFlags(this, in state);
 
+                    // TFindOptimizationsHandler is redundant here as
+                    // going into NFA mode signals something already exploded
+
                     // Dead end here means the set is empty
                     if (state.NfaState!.NfaStateSet.Count == 0)
                     {
@@ -934,7 +933,7 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
                 i -= _optimizedReversalState.Item1;
                 currentState = new CurrentState(_optimizedReversalState.Item2);
                 // anchor variant may need context to be computed if nullable
-                if (_containsAnyAnchor && _canBeNullableArray[currentState.DfaStateId])
+                if (_containsAnyAnchor && _nullabilityArray[currentState.DfaStateId] > 0)
                 {
                     if (TNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
                             in currentState, TInputReader.GetPositionId(this, input, i),
@@ -996,7 +995,7 @@ private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilit
                     int positionId = TInputReader.GetPositionId(this, input, pos - 1);
                     // If the state accepts the empty string, we found a valid starting position.  Record it and keep going,
                     // since we're looking for the earliest one to occur within bounds.
-                    if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
+                    if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
                             TStateHandler.GetStateFlags(this, in state)))
                     {
                         lastStart = pos;
@@ -1341,7 +1340,8 @@ private interface IStateHandler
             public static bool StartsWithLineAnchor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor;
 
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind);
+            public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state,
+                uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0;
 
             /// <summary>Gets the preferred DFA state for nullability. In DFA mode this is just the state itself.</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -1708,7 +1708,7 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
                 where TStateHandler : struct, IStateHandler
             {
                 Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor);
-                return flags.IsNullable();
+                return matcher.IsNullableWithContext(state.DfaStateId, positionId);
             }
         }
 

From dd121de495123fcfb7cc1424532873287d2169fe Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 30 May 2024 22:16:52 +0300
Subject: [PATCH 15/63] reverting some changes

---
 .../Symbolic/MatchingState.cs                  | 10 ++--------
 .../Symbolic/MintermClassifier.cs              |  7 +++----
 .../Symbolic/RegexNodeConverter.cs             |  2 +-
 .../Symbolic/SymbolicRegexMatcher.cs           | 18 ++++++++++++++++--
 .../Symbolic/SymbolicRegexThresholds.cs        |  4 ++--
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 941a5f76ea27e..5d9c66dbb07e0 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -18,11 +18,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         }
 
         /// <summary>
-        /// TODO: The CLR assigns an entire field for this byte which is a waste,
-        /// and the much more preferred way to use this is in _nullabilityArray in the matcher
-        /// but the current design relies on interfaces/flags and
-        /// using the MatchingState directly so this byte is a quick solution to cheapen
-        /// it there by ~30% as well without having to breaking it all to pieces
+        /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
         /// </summary>
         internal readonly int NullabilityInfo;
 
@@ -106,8 +102,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         }
 
         /// <summary>
-        /// TODO: This method should really never be used and
-        /// is only used to speed up the existing architecture.
+        /// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned
         /// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
         /// whereever possible
         /// </summary>
@@ -166,7 +161,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
         /// 00100 -> nullable for NewLine
         /// 01000 -> nullable for NewLineS
         /// 10000 -> nullable for WordLetter
-        /// todo: change to flags later
         /// </summary>
         /// <returns></returns>
         internal byte BuildNullabilityInfo()
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 1f852d16cee2c..83cd14daf30d2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -47,9 +47,8 @@ public MintermClassifier(BDD[] minterms)
                 return;
             }
 
-            // low memory compromise is to create an ascii-only array
+            // ascii-only array to save memory
             // int mintermId = c >= 128 ? 0 : mtlookup[c];
-            // and only exists because the wasm tests fail with OOM
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
@@ -60,8 +59,8 @@ public MintermClassifier(BDD[] minterms)
                 }
             }
 
-            // assign minterm category for every char
-            // unused characters in minterm 0 get mapped to zero
+            // i have never seen a regex use over 80 minterms not to speak of 255,
+            // but it's there as a fallback mechanism
             if (minterms.Length > 255)
             {
                 // over 255 unique sets also means it's never ascii only
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
index 88fc386b6956e..31f01271d558b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
@@ -532,7 +532,7 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
         // /// <summary>
         // /// attempt to remove anchors when possible since it reduces overhead
         // /// more rewrites could be tried but it's important to preserve PCRE semantics
-        // /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner
+        // /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner
         // /// </summary>
         // /// <param name="builder"></param>
         // /// <param name="rootNode"></param>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 343256513bdcd..258c0ab72ae46 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -81,7 +81,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
         private readonly RegexFindOptimizations? _findOpts;
 
-        /// <summary>Dead end state to quickly return NoMatch</summary>
+        /// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
         private readonly int _deadStateId;
 
         /// <summary>Whether the pattern contains any anchor</summary>
@@ -102,6 +102,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _reverseInitialStates;
 
+        /// <summary>
+        /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
+        /// </summary>
         private readonly (int, MatchingState<TSet>) _optimizedReversalState;
 
         /// <summary>Partition of the input space of sets.</summary>
@@ -328,6 +331,8 @@ uint CalculateMintermIdKind(int mintermId)
         /// </summary>
         internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
 
+        /// TODO: when you're calling a function millions of times per second even this add 1 does cost something
+        /// this should be ideally remapped
         /// <summary>Look up what is the character kind given a position ID</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
@@ -351,6 +356,7 @@ internal TSet GetMintermFromId(int mintermId)
             return minterms[mintermId];
         }
 
+        /// <summary>TODO: this if-else branch could be called once. it's currently causing overhead on every single step</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
             where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ?
@@ -657,6 +663,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
         /// TODO: this is essentially a stripped down version when there's no good prefix optimizations
         /// i don't trust the compiler to optimize this and it makes a
         /// ~50% difference in performance with removing unnecessary checks alone
+        ///
         /// </summary>
         private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
                 ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
@@ -668,9 +675,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int endStateId = endStateIdRef;
             int currStateId = startStateId;
+            // ldfld only once
+            // int deadStateId = _deadStateId;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
+                // The goal is to make this loop as fast as it can possible be,
+                // every single piece of overhead should be removed here
+                // there should be not a single callvirt instruction in the loop
+                // ldfld only if necessary (e.g. a reference changes)
+                // no memory writes unless necessary
                 while (true)
                 {
                     if (currStateId == _deadStateId)
@@ -783,7 +797,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
                         endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index b00f2631c3aa2..d455f26da1dcf 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -26,14 +26,14 @@ internal static class SymbolicRegexThresholds
         /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
         /// with an entirely different search-time algorithmic complexity
         /// 100_000 isn't a really a high memory cost either,
-        /// i'd even put 1_000_000 on the table but that might push it for general purpose use
+        /// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has
         /// </remarks>
         internal const int NfaThreshold = 100_000;
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
         /// after the AST has been anlayzed for safe handling.
-        /// TODO: this is perhaps too conservative, consider raising this
+        /// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too
         /// <remarks>
         /// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
         /// This default value may be overridden with the AppContext data

From 723c5b61e9814a352c39e50edc3b8842da428b2f Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 6 Jun 2024 02:15:14 +0300
Subject: [PATCH 16/63] testing nfa fallback

---
 .../Symbolic/SymbolicRegexMatcher.cs          | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 258c0ab72ae46..15c1ae489c6aa 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -503,11 +503,12 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                     pos + CharsPerTimeoutCheck :
                     input.Length;
 
+                // one-off check for input end
                 if (pos == input.Length && currentState.NfaState is null)
                 {
-                    if ((!_stateFlagsArray[currentState.DfaStateId].IsNullable() &&
-                         !_stateArray[currentState.DfaStateId]!.IsNullableFor(
-                             GetPositionKind(-1))))
+                    if (!(_stateFlagsArray[currentState.DfaStateId].IsNullable() ||
+                            _stateArray[currentState.DfaStateId]!.IsNullableFor(
+                                GetPositionKind(-1))))
                     {
                         break;
                     }
@@ -517,6 +518,7 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                     break;
                 }
 
+
                 bool done;
                 if (currentState.NfaState is not null)
                     // nfa fallback check
@@ -544,7 +546,6 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                             TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
                             ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
                 }
-
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
                 if (done || pos >= input.Length)
@@ -626,15 +627,15 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                     // Note: the order here is important so the transition gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1)
                     {
-                        pos++;
-                        if (pos < input.Length)
+                        if (pos + 1 < input.Length)
                         {
                             return false;
                         }
+                        pos++;
                         // one off check for the final position
                         // this is just to move it out of the hot loop
-                        if ((!_stateFlagsArray[currStateId].IsNullable() &&
-                             !_stateArray[currStateId]!.IsNullableFor(
+                        if (!(_stateFlagsArray[currStateId].IsNullable() ||
+                             _stateArray[currStateId]!.IsNullableFor(
                                  GetPositionKind(-1))))
                         {
                             return false;
@@ -710,15 +711,15 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                     // Note: the order here is important so the transition gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1)
                     {
-                        pos++;
-                        if (pos < input.Length)
+                        if (pos + 1 < input.Length)
                         {
                             return false;
                         }
+                        pos++;
                         // one off check for the final position
                         // this is just to move it out of the hot loop
-                        if ((!_stateFlagsArray[currStateId].IsNullable() &&
-                             !_stateArray[currStateId]!.IsNullableFor(
+                        if (!(_stateFlagsArray[currStateId].IsNullable() ||
+                             _stateArray[currStateId]!.IsNullableFor(
                                  GetPositionKind(-1))))
                         {
                             return false;
@@ -1722,7 +1723,7 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
                 where TStateHandler : struct, IStateHandler
             {
                 Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor);
-                return matcher.IsNullableWithContext(state.DfaStateId, positionId);
+                return flags.IsNullable();
             }
         }
 
@@ -1736,6 +1737,8 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
                 where TStateHandler : struct, IStateHandler
             {
                 return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
+                // cannot be used in NFA mode
+                // return matcher.IsNullableWithContext(state.DfaStateId, positionId);
             }
         }
     }

From 6bf4095ad83a9e3c2be7f55c1c55744b56211bc9 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 17 Jun 2024 23:55:54 +0300
Subject: [PATCH 17/63] refactoring, work in progress

---
 .../src/System.Text.RegularExpressions.csproj |   6 +-
 .../Symbolic/MatchReversal.cs                 |  17 +
 .../Symbolic/MatchReversalKind.cs             |  14 +
 .../Symbolic/MatchingState.cs                 |   2 -
 .../Symbolic/MintermClassifier.cs             |  31 +-
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  37 ++-
 .../Symbolic/SymbolicRegexMatcher.cs          | 312 +++++++++++-------
 7 files changed, 269 insertions(+), 150 deletions(-)
 create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
 create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 6fbc17722a774..0d952017013c0 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,7 +5,9 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-<!--    <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649</NoWarn>-->
+   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>
+<!--   documentation warnings -->
+   <NoWarn>$(NoWarn);CS1574</NoWarn>
   </PropertyGroup>
 
   <ItemGroup>
@@ -94,6 +96,8 @@
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryConditions.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryRanges.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryRangesGenerator.cs" />
+    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversalKind.cs"/>
+    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversal.cs"/>
     <!-- Common or Common-branched source files -->
     <Compile Include="$(CommonPath)System\HexConverter.cs" Link="Common\System\HexConverter.cs" />
     <Compile Include="$(CommonPath)System\Obsoletions.cs" Link="Common\System\Obsoletions.cs" />
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
new file mode 100644
index 0000000000000..b7be92195ee58
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -0,0 +1,17 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.RegularExpressions.Symbolic;
+
+internal sealed class MatchReversal<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
+{
+    public MatchReversal(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
+    {
+        Kind = kind;
+        FixedLength = fixedLength;
+        AdjustedStartState = adjustedStartState;
+    }
+    internal MatchReversalKind Kind { get; }
+    internal int FixedLength { get; }
+    internal MatchingState<TSet>? AdjustedStartState { get; }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
new file mode 100644
index 0000000000000..d498e4dd7eb99
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
@@ -0,0 +1,14 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.RegularExpressions.Symbolic;
+
+internal enum MatchReversalKind
+{
+    /// <summary>The most generic option, run the regex backwards to find beginning of match</summary>
+    MatchStart,
+    /// <summary>Part of the reversal is fixed length and can be skipped</summary>
+    PartialFixedLength,
+    /// <summary>The entire pattern is fixed length, reversal not necessary</summary>
+    FixedLength
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 5d9c66dbb07e0..5bd2baf668d3d 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -166,8 +166,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
         internal byte BuildNullabilityInfo()
         {
             byte nullabilityInfo = 0;
-            // this is significantly cheaper to initialize once
-            // than to pay for it on every call
             if (Node.CanBeNullable)
             {
                 for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 83cd14daf30d2..d3a0933c18433 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -49,6 +49,7 @@ public MintermClassifier(BDD[] minterms)
 
             // ascii-only array to save memory
             // int mintermId = c >= 128 ? 0 : mtlookup[c];
+            // _isAsciiOnly = true;
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
@@ -63,20 +64,24 @@ public MintermClassifier(BDD[] minterms)
             // but it's there as a fallback mechanism
             if (minterms.Length > 255)
             {
+                // WIP: temporary exception to see if any tests in the pipeline reach this
+                // if nothing reaches this perhaps it'd be easier to just throw an exception
+                // during construction
+                throw new Exception($"reached over 255 minterms, count {minterms}");
                 // over 255 unique sets also means it's never ascii only
-                int[] lookup = new int[ushort.MaxValue + 1];
-                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
-                {
-                    // precompute all assigned minterm categories
-                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                    foreach ((uint start, uint end) in mintermRanges)
-                    {
-                        // assign character ranges in bulk
-                        Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                        slice.Fill(mintermId);
-                    }
-                }
-                _intLookup = lookup;
+                // int[] lookup = new int[ushort.MaxValue + 1];
+                // for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+                // {
+                //     // precompute all assigned minterm categories
+                //     (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                //     foreach ((uint start, uint end) in mintermRanges)
+                //     {
+                //         // assign character ranges in bulk
+                //         Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
+                //         slice.Fill(mintermId);
+                //     }
+                // }
+                // _intLookup = lookup;
             }
             else
             {
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 60e8712298bfb..1ef89b006fef0 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -60,13 +60,13 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         private bool[] _canBeAcceleratedArray;
 
 #if DEBUG
-        // private readonly Action<string> _wout = st =>
-        // {
-        //     var a_cons = System.Reflection.Assembly.Load("System.Console");
-        //     var t_cons = a_cons.GetType("System.Console")!;
-        //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
-        //     wl!.Invoke(null, [st]);
-        // };
+        private readonly Action<string> _wout = st =>
+        {
+            var a_cons = System.Reflection.Assembly.Load("System.Console");
+            var t_cons = a_cons.GetType("System.Console")!;
+            var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
+            wl!.Invoke(null, [st]);
+        };
 #endif
         /// <summary>
         /// The transition function for DFA mode.
@@ -199,7 +199,7 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
         /// </summary>
         /// <param name="node">reversed initial pattern</param>
         /// <returns>returns n of chars to skip and adjusted reversal start state</returns>
-        private (int, MatchingState<TSet>) CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
+        private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
         {
             int pos = 0;
             SymbolicRegexNode<TSet>? current = node;
@@ -240,10 +240,15 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
             while (canLoop)
             {
 #if DEBUG
-                // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}");
+                // if (current._left is null)
+                //     _wout($"NULL {current._kind}");
+                // else
+                //     _wout($"{pos} {current._kind} l:{current._left!._kind} {current}");
 #endif
                 (bool loop, SymbolicRegexNode<TSet> next) = current switch
                 {
+                    // if this is reached then entire match is fixed length
+                    { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
                     {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} =>
                         (true, current._right!),
                     {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } =>
@@ -257,7 +262,17 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
                 canLoop = loop;
                 current = next;
             }
-            return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
+
+            MatchReversal<TSet> reversal =
+                (pos, current) switch
+                {
+                    { pos: > 0 } when current == _builder.Epsilon => new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos),
+                    { pos: > 0 } => new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos,
+                        GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)),
+                    _ => new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0)
+                };
+
+            return reversal;
         }
 
         /// <summary>
@@ -424,7 +439,7 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse
                     MatchingState<TSet> coreState = GetState(coreId);
                     TSet minterm = GetMintermFromId(mintermId);
                     uint nextCharKind = GetPositionKind(mintermId);
-                    SymbolicRegexNode<TSet>? targetNode = coreTargetId > 0 ?
+                    SymbolicRegexNode<TSet> targetNode = coreTargetId > 0 ?
                         GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind);
 
                     List<int> targetsList = new();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 15c1ae489c6aa..30fc1be98abdf 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -105,7 +105,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>
         /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
         /// </summary>
-        private readonly (int, MatchingState<TSet>) _optimizedReversalState;
+        private readonly MatchReversal<TSet> _optimizedReversalState;
 
         /// <summary>Partition of the input space of sets.</summary>
         private readonly TSet[] _minterms;
@@ -159,11 +159,11 @@ public static SymbolicRegexMatcher<TSet> Create(
 
             // Convert the BDD-based AST to TSet-based AST
             SymbolicRegexNode<TSet> rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver));
-            return new SymbolicRegexMatcher<TSet>(builder, rootNode, captureCount, findOptimizations, matchTimeout);
+            return new SymbolicRegexMatcher<TSet>(bddBuilder, builder, rootNode, captureCount, findOptimizations, matchTimeout);
         }
 
         /// <summary>Constructs matcher for given symbolic regex.</summary>
-        private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
+        private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> bddBuilder, SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
         {
             Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}");
 
@@ -200,23 +200,34 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                 _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId);
             }
 
+            // Create optimized reversal
+            _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder));
+
             // Store the find optimizations that can be used to jump ahead to the next possible starting location.
             // If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's
             // handling for beginning anchors.
             if (findOptimizations.IsUseful &&
                 findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
             {
+                // this makes some assumptions about the frequency of occurrences
+                // some large sets like \p{Sm} are faster with infrequent matches but slower with frequent matches
+                // the easiest thing to do here is to leave it as-is, but this means some inputs can have large performance losses of 10x or more
+
                 var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
                 {
-                    // _wout($"s{fds.Set}");
-                    // _wout($"c{fds.Chars.AsSpan()}");
+                    // _wout($"rn{fds.Range is null}");
+                    // _wout($"cn{fds.Chars is null}");
+                    // _wout($"cc{fds.Chars!.Length}");
                     return fds switch
                     {
-                        // anything above 4 uint16 chars is generally slower than DFA
-                        { Chars: not null } => fds.Chars.Length > 4,
+                        { Chars: not null } =>
+                            // anything above 4 uint16 chars is generally slower than DFA
+                            fds.Negated ||
+                            (fds.Chars.Length > 4 &&
+                            Array.Exists(fds.Chars, char.IsAsciiLetterLower)),
                         { Range: not null } => false,
-                        { Set: not null } => true,
-                        _ => false
+                        _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength,
+                            // false
                     };
                 });
                 // a DFA is sometimes 10x-100x faster than the optimizations
@@ -230,7 +241,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                         findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations,
                     _ => findOptimizations // TODO: unsure which options are left here
                 };
+                // _findOpts = findOptimizations;
+                // _findOpts = null;
                 // _wout($"{findOptimizations.FindMode}");
+                // _wout($"{findOptimizations.FixedDistanceSets![0]}");
                 // _wout($"o{_findOpts}");
             }
 
@@ -282,8 +296,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             }
             _reverseInitialStates = reverseInitialStates;
 
-            // Create optimized reversal
-            _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder));
 
             // Maps a minterm ID to a character kind
             uint CalculateMintermIdKind(int mintermId)
@@ -397,17 +409,18 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
             // the c as the low boundary for the starting position.
-            int matchStartLowBoundary, matchStartLengthMarker;
+            // int matchStartLowBoundary, matchStartLengthMarker;
+            int matchStartLowBoundary;
             int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
             {
-                (true, true, true) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (true, true, false) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (true, false, true) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (true, false, false) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (false, true, true) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (false, true, false) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (false, false, true) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
-                (false, false, false) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
+                (true, true, true) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (true, true, false) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (true, false, true) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (true, false, false) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (false, true, true) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (false, true, false) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (false, false, true) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
+                (false, false, false) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
             };
 
             // If there wasn't a match, we're done.
@@ -430,21 +443,61 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // exact number of positions backwards.  Continuing the previous example, phase 2 will walk backwards from
             // that last b until it finds the 4th a: aaabbbc.
             int matchStart;
-            if (matchStartLengthMarker >= 0)
+            Debug.Assert(matchEnd >= startat - 1);
+            switch (_optimizedReversalState.Kind)
             {
-                matchStart = matchEnd - matchStartLengthMarker;
-            }
-            else
-            {
-                Debug.Assert(matchEnd >= startat - 1);
-                matchStart = matchEnd < startat ?
-                    startat : (_pattern._info.ContainsEndZAnchor, _pattern._info.ContainsSomeAnchor) switch
+                case MatchReversalKind.FixedLength:
+                    matchStart = (matchEnd - _optimizedReversalState.FixedLength);
+                    break;
+                case MatchReversalKind.MatchStart:
+                case MatchReversalKind.PartialFixedLength:
+                    int initialLastStart = -1; // invalid sentinel value
+                    int i = matchEnd;
+                    uint charKind2 = GetCharKind<FullInputReader>(input, matchEnd);
+                    CurrentState reversalStartState;
+
+                    // _containsAnyAnchor
+                    if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength)
                     {
-                        (true, true) => FindStartPosition<FullInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
-                        (true, false) => FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
-                        (false, true) => FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
-                        (false, false) => FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
+                        i -= _optimizedReversalState.FixedLength;
+                        reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!);
+                        // reversal may already be nullable here in the case of anchors
+                        if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0)
+                        {
+                            if (FullNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
+                                    in reversalStartState, FullInputReader.GetPositionId(this, input, i),
+                                    DfaStateHandler.GetStateFlags(this, in reversalStartState)))
+                            {
+                                initialLastStart = i;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        reversalStartState = new CurrentState(_reverseInitialStates[charKind2]);
+                    }
+                    uint charKind = GetCharKind<FullInputReader>(input, matchEnd);
+                    var startState = new CurrentState(_reverseInitialStates[charKind]);
+                    matchStart = matchEnd < startat
+                        ? startat
+                    : (_containsEndZAnchor, _containsAnyAnchor) switch
+                    {
+                        (true, true) =>
+                            FindStartPosition<FullInputReader, FullNullabilityHandler>(
+                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                        (true, false) =>
+                            FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(
+                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                        (false, true) =>
+                            FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(
+                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                        (false, false) =>
+                            FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(
+                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
                     };
+                    break;
+                default:
+                    throw new ArgumentOutOfRangeException();
             }
 
             // Phase 3:
@@ -471,12 +524,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
         /// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
         /// <param name="mode">The mode of execution based on the regex operation being performed.</param>
         /// <param name="initialStatePos">The last position the initial state of <see cref="_dotStarredPattern"/> was visited before the end position was found.</param>
-        /// <param name="matchLength">Length of the match if there's a match; otherwise, -1.</param>
         /// <param name="perThreadData">Per thread data reused between calls.</param>
         /// <returns>
         /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
         /// </returns>
-        private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData)
+        private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, PerThreadData perThreadData)
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
             where TNullabilityHandler : struct, INullabilityHandler
@@ -487,7 +539,6 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
             var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<TInputReader>(input, pos - 1)]);
 
             int endPos = NoMatchExists;
-            int endStateId = -1;
 
             while (true)
             {
@@ -503,48 +554,34 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                     pos + CharsPerTimeoutCheck :
                     input.Length;
 
-                // one-off check for input end
-                if (pos == input.Length && currentState.NfaState is null)
-                {
-                    if (!(_stateFlagsArray[currentState.DfaStateId].IsNullable() ||
-                            _stateArray[currentState.DfaStateId]!.IsNullableFor(
-                                GetPositionKind(-1))))
-                    {
-                        break;
-                    }
-                    // the end position (-1) was nullable
-                    endPos = pos;
-                    endStateId = currentState.DfaStateId;
-                    break;
-                }
 
 
                 bool done;
                 if (currentState.NfaState is not null)
                     // nfa fallback check
                     done = FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
-                            ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
-                else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null)
+                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate);
+                // else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null)
+                else if (_findOpts is null && !_containsEndZAnchor)
                 {
                     done =
-                        _mintermClassifier.IsAsciiOnly()
-                        ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1,
-                            mode, ref pos,
-                            currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
-                            ref initialStatePosCandidate)
+                        // _mintermClassifier.IsAsciiOnly()
+                        // ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1,
+                        //     mode, ref pos,
+                        //     currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
+                        //     ref initialStatePosCandidate)
                         // if there are no edge cases then use the quicker loop
-                        :
-                        FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos,
-                        currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
+                        // :
+                        FindEndPositionDeltasDFANoSkip<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
+                            TNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
+                        currentState.DfaStateId, ref endPos, ref initialStatePos,
                         ref initialStatePosCandidate);
                 }
                 else
                 {
                     // dfa loop with potential skipping
                     done = FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
-                            ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate);
                 }
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
@@ -570,13 +607,9 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                 {
                     CheckTimeout(timeoutOccursAt);
                 }
-            }
 
-            // Check whether there's a fixed-length marker for the current state.  If there is, we can
-            // use that length to optimize subsequent matching phases.
-            // TODO: profiling shows around 4% gets lost here with high-match count,
-            // if not for the endZ anchor this could be cached with minterm lookup
-            matchLength = endStateId > 0 ? GetState(endStateId).FixedLength(GetCharKind<TInputReader>(input, endPos)) : -1;
+
+            }
             return endPos;
         }
 
@@ -586,14 +619,13 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
         /// `int positionId = c >= 128 ? 0 : mtlookup[c]`;
         /// </summary>
         private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-                ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
         {
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
             // can only be used with full array initialized and <= 255 minterms
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
-            int endStateId = endStateIdRef;
             int currStateId = startStateId;
             try
             {
@@ -614,7 +646,6 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                     {
                         {
                             endPos = pos;
-                            endStateId = currStateId;
                             // A match is known to exist.  If that's all we need to know, we're done.
                             if (mode == RegexRunnerMode.ExistenceRequired)
                             {
@@ -642,7 +673,6 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                         }
                         // the end position (-1) was nullable
                         endPos = pos;
-                        endStateId = currStateId;
                         return mode == RegexRunnerMode.ExistenceRequired;
                     }
 
@@ -655,8 +685,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                endStateIdRef = endStateId;
-                initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
+                initialStatePosRef = currStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
             }
         }
 
@@ -666,39 +695,61 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
         /// ~50% difference in performance with removing unnecessary checks alone
         ///
         /// </summary>
-        private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-                ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+        private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader, TFindOptimizationsHandler,
+            TNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
+                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+            where TStateHandler : struct, IStateHandler
+            where TInputReader : struct, IInputReader
+            where TFindOptimizationsHandler : struct, IInitialStateHandler
+            where TNullabilityHandler : struct, INullabilityHandler
         {
+            // initial check for input end to get it out of the loop
+            if (posRef == input.Length)
+            {
+                if (!(_stateFlagsArray[startStateId].IsNullable() ||
+                      _stateArray[startStateId]!.IsNullableFor(
+                          GetPositionKind(-1))))
+                {
+                    return true;
+                }
+
+                // the end position (-1) was nullable
+                endPosRef = posRef;
+                return true;
+            }
+
+            // Debug.Assert(posRef < input.Length, $"input end condition should be handled outside the loop");
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
             // can only be used with full array initialized and <= 255 minterms
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
-            int endStateId = endStateIdRef;
             int currStateId = startStateId;
             // ldfld only once
-            // int deadStateId = _deadStateId;
+            int deadStateId = _deadStateId;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
-                // The goal is to make this loop as fast as it can possible be,
+                // The goal is to make this loop as fast as it can possibly be,
                 // every single piece of overhead should be removed here
                 // there should be not a single callvirt instruction in the loop
                 // ldfld only if necessary (e.g. a reference changes)
                 // no memory writes unless necessary
                 while (true)
                 {
-                    if (currStateId == _deadStateId)
+                    if (currStateId == deadStateId)
                     {
                         return true;
                     }
+
+                    // acceleratedstatehandler
+
                     // If the state is nullable for the next character, we found a potential end state.
                     // note: the double array lookup is important here, storing a local variable is expensive
                     if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]]))
                     {
                         {
                             endPos = pos;
-                            endStateId = currStateId;
                             // A match is known to exist.  If that's all we need to know, we're done.
                             if (mode == RegexRunnerMode.ExistenceRequired)
                             {
@@ -726,7 +777,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                         }
                         // the end position (-1) was nullable
                         endPos = pos;
-                        endStateId = currStateId;
                         return mode == RegexRunnerMode.ExistenceRequired;
                     }
 
@@ -739,8 +789,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                endStateIdRef = endStateId;
-                initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
+                initialStatePosRef = currStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
             }
         }
 
@@ -764,7 +813,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
         private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -773,18 +822,19 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            int endStateId = endStateIdRef;
             int initialStatePos = initialStatePosRef;
             int initialStatePosCandidate = initialStatePosCandidateRef;
+            int deadStateId = _deadStateId;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    if (state.DfaStateId == _deadStateId)
+                    if (state.DfaStateId == deadStateId)
                     {
                         return true;
                     }
+                    // TAcceleratedStateHandler.TryNextPosition
                     if (_canBeAcceleratedArray[state.DfaStateId])
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
@@ -798,10 +848,11 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state,
+                            positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
-                        endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
+                        // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
                         initialStatePos = initialStatePosCandidate;
 
                         // A match is known to exist.  If that's all we need to know, we're done.
@@ -812,7 +863,8 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     }
 
                     // If there is more input available try to transition with the next character.
-                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state,
+                            positionId))
                     {
                         return false;
                     }
@@ -826,7 +878,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                endStateIdRef = endStateId;
+                // endStateIdRef = endStateId;
                 initialStatePosRef = initialStatePos;
                 initialStatePosCandidateRef = initialStatePosCandidate;
             }
@@ -851,7 +903,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
         private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -860,7 +912,6 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            int endStateId = endStateIdRef;
             int initialStatePos = initialStatePosRef;
             int initialStatePosCandidate = initialStatePosCandidateRef;
             try
@@ -886,7 +937,6 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
                     if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, flags))
                     {
                         endPos = pos;
-                        endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
                         initialStatePos = initialStatePosCandidate;
 
                         // A match is known to exist.  If that's all we need to know, we're done.
@@ -911,7 +961,6 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                endStateIdRef = endStateId;
                 initialStatePosRef = initialStatePos;
                 initialStatePosCandidateRef = initialStatePosCandidate;
             }
@@ -925,43 +974,22 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
         /// The start position is known to exist; this function just needs to determine exactly what it is.
         /// We need to find the earliest (lowest index) starting position that's not earlier than <paramref name="matchStartBoundary"/>.
         /// </remarks>
+        /// <param name="startState">State to start reversal from</param>
+        /// <param name="initialLastStart">Either valid match start location or -1</param>
         /// <param name="input">The input text.</param>
         /// <param name="i">The ending position to walk backwards from. <paramref name="i"/> points one past the last character of the match.</param>
         /// <param name="matchStartBoundary">The initial starting location discovered in phase 1, a point we must not walk earlier than.</param>
         /// <param name="perThreadData">Per thread data reused between calls.</param>
         /// <returns>The found starting position for the match.</returns>
-        private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
+        private int FindStartPosition<TInputReader, TNullabilityHandler>(CurrentState startState, int initialLastStart, ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
             where TInputReader : struct, IInputReader
             where TNullabilityHandler : struct, INullabilityHandler
         {
             Debug.Assert(i >= 0, $"{nameof(i)} == {i}");
             Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}");
             Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}.");
-
-            // Get the starting state for the reverse pattern. This depends on previous character (which, because we're
-            // going backwards, is character number i).
-            CurrentState currentState;
-            int lastStart = -1; // invalid sentinel value
-            // if possible use optimized reversal instead
-            if (_optimizedReversalState.Item1 > 0)
-            {
-                i -= _optimizedReversalState.Item1;
-                currentState = new CurrentState(_optimizedReversalState.Item2);
-                // anchor variant may need context to be computed if nullable
-                if (_containsAnyAnchor && _nullabilityArray[currentState.DfaStateId] > 0)
-                {
-                    if (TNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
-                            in currentState, TInputReader.GetPositionId(this, input, i),
-                            DfaStateHandler.GetStateFlags(this, in currentState)))
-                    {
-                        lastStart = i;
-                    }
-                }
-            }
-            else
-            {
-                currentState = new CurrentState(_reverseInitialStates[GetCharKind<TInputReader>(input, i)]);
-            }
+            CurrentState currentState = startState;
+            int lastStart = initialLastStart;
 
             // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
             while (true)
@@ -1634,6 +1662,12 @@ private interface IInputReader
             public static abstract int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos);
         }
 
+        private readonly struct OptimizedAsciiInputReader : IInputReader
+        {
+            public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos) =>
+                matcher._mintermClassifier.GetMintermID(input[pos]);
+        }
+
         /// <summary>This reader omits the special handling of \n for the \Z anchor.</summary>
         private readonly struct NoZAnchorInputReader : IInputReader
         {
@@ -1658,15 +1692,47 @@ public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan
             }
         }
 
+
+        private interface IInitialStateHandler
+        {
+            public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
+                where TInputReader : struct, IInputReader;
+        }
+
         /// <summary>
         /// Interface for optimizations to accelerate search from initial states.
         /// </summary>
-        private interface IInitialStateHandler
+        private interface IAcceleratedStateHandler
         {
-            public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
+            public static abstract void TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
                 where TInputReader : struct, IInputReader;
         }
 
+        private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static void TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
+                where TInputReader : struct, IInputReader
+            {
+                // Find the first position that matches with some likely character.
+                if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+                {
+                    // No match exists
+                    state = new CurrentState(matcher.GetState(matcher._deadStateId));
+                    pos = input.Length;
+                    return;
+                }
+
+                // Update the starting state based on where TryFindNextStartingPosition moved us to.
+                // As with the initial starting state, if it's a dead end, no match exists.
+                state = new CurrentState(
+                    matcher._dotstarredInitialStates[matcher.GetCharKind<TInputReader>(input, pos - 1)]);
+                return;
+            }
+        }
+
         /// <summary>
         /// No-op handler for when there are no initial state optimizations to apply.
         /// </summary>

From b10e600dfab71d795f884f00af6ac3f79127d04e Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 18 Jun 2024 19:46:40 +0300
Subject: [PATCH 18/63] refactoring to struct interfaces

---
 .../src/System.Text.RegularExpressions.csproj |   6 +-
 .../Symbolic/MintermClassifier.cs             |  13 +-
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  14 +-
 .../Symbolic/SymbolicRegexMatcher.cs          | 459 ++++++++++++------
 4 files changed, 330 insertions(+), 162 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 0d952017013c0..b3fda5f2f4326 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,9 +5,9 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>
-<!--   documentation warnings -->
-   <NoWarn>$(NoWarn);CS1574</NoWarn>
+<!--   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>-->
+<!--&lt;!&ndash;   documentation warnings &ndash;&gt;-->
+<!--   <NoWarn>$(NoWarn);CS1574</NoWarn>-->
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index d3a0933c18433..96cd72c4c0c95 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -27,11 +27,11 @@ internal sealed class MintermClassifier
         /// <summary>Conserve memory if pattern is ascii-only</summary>
         private readonly bool _isAsciiOnly;
 
-        /// <summary>
-        /// fallback lookup if over 255 minterms
-        /// this is almost never used
-        /// </summary>
-        private readonly int[]? _intLookup;
+        // /// <summary>
+        // /// fallback lookup if over 255 minterms
+        // /// this is almost never used
+        // /// </summary>
+        // private readonly int[]? _intLookup;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
@@ -111,7 +111,8 @@ public int GetMintermID(int c)
             }
             // high performance variant would use a span directly.
             // additional memory is saved by using a byte
-            return _intLookup is null ? _lookup![c] : _intLookup[c];
+            return _lookup![c];
+            // return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
 
         /// <summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 1ef89b006fef0..eac516c43bcdd 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -60,13 +60,13 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         private bool[] _canBeAcceleratedArray;
 
 #if DEBUG
-        private readonly Action<string> _wout = st =>
-        {
-            var a_cons = System.Reflection.Assembly.Load("System.Console");
-            var t_cons = a_cons.GetType("System.Console")!;
-            var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
-            wl!.Invoke(null, [st]);
-        };
+        // private readonly Action<string> _wout = st =>
+        // {
+        //     var a_cons = System.Reflection.Assembly.Load("System.Console");
+        //     var t_cons = a_cons.GetType("System.Console")!;
+        //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
+        //     wl!.Invoke(null, [st]);
+        // };
 #endif
         /// <summary>
         /// The transition function for DFA mode.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 30fc1be98abdf..3f0543772b52a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -163,7 +163,7 @@ public static SymbolicRegexMatcher<TSet> Create(
         }
 
         /// <summary>Constructs matcher for given symbolic regex.</summary>
-        private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> bddBuilder, SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
+        private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
         {
             Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}");
 
@@ -226,8 +226,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> bddBuilder, SymbolicRegex
                             (fds.Chars.Length > 4 &&
                             Array.Exists(fds.Chars, char.IsAsciiLetterLower)),
                         { Range: not null } => false,
+                        // for fixed length strings just trust the optimizations
                         _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength,
-                            // false
                     };
                 });
                 // a DFA is sometimes 10x-100x faster than the optimizations
@@ -408,20 +408,53 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // It returns NoMatchExists (-2) when there is no match.
             // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
-            // the c as the low boundary for the starting position.
-            // int matchStartLowBoundary, matchStartLengthMarker;
-            int matchStartLowBoundary;
-            int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
+            // the c as the low boundary for the starting position.d
+            // int matchStartLowBoundary = startat;
+            int matchEnd;
+            if (!_containsEndZAnchor)
             {
-                (true, true, true) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (true, true, false) => FindEndPosition<FullInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (true, false, true) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (true, false, false) => FindEndPosition<FullInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (false, true, true) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (false, true, false) => FindEndPosition<NoZAnchorInputReader, InitialStateFindOptimizationsHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (false, false, true) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-                (false, false, false) => FindEndPosition<NoZAnchorInputReader, NoOptimizationsInitialStateHandler, NoAnchorsNullabilityHandler>(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData),
-            };
+                bool isAsciiOnly = _mintermClassifier.IsAsciiOnly();
+                matchEnd = (isAsciiOnly, _findOpts is not null, _containsAnyAnchor) switch
+                {
+                    (true, true, true) =>
+                        FindEndPositionOptimized<OptimizedAsciiInputReader, AcceleratedStateHandler,
+                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (true, true, false) =>
+                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAnchorAcceleratedStateHandler,
+                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (true, false, false) =>
+                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAcceleratedStateHandler,
+                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (true, false, true) =>
+                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAcceleratedStateHandler,
+                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, true, false) =>
+                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAnchorAcceleratedStateHandler,
+                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, true, true) =>
+                        FindEndPositionOptimized<OptimizedUnicodeInputReader, AcceleratedStateHandler,
+                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, false, true) =>
+                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
+                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, false, false) =>
+                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
+                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                };
+            }
+            else
+            {
+                // fallback for EndZ anchor
+                matchEnd = (_findOpts is not null) switch
+                {
+                    true =>
+                        FindEndPositionFallback<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(
+                            input, startat, timeoutOccursAt, mode, perThreadData),
+                    false =>
+                        FindEndPositionFallback<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(
+                            input, startat, timeoutOccursAt, mode, perThreadData),
+                };
+            }
 
             // If there wasn't a match, we're done.
             if (matchEnd == NoMatchExists)
@@ -453,10 +486,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                 case MatchReversalKind.PartialFixedLength:
                     int initialLastStart = -1; // invalid sentinel value
                     int i = matchEnd;
-                    uint charKind2 = GetCharKind<FullInputReader>(input, matchEnd);
                     CurrentState reversalStartState;
-
-                    // _containsAnyAnchor
                     if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength)
                     {
                         i -= _optimizedReversalState.FixedLength;
@@ -474,26 +504,25 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                     }
                     else
                     {
-                        reversalStartState = new CurrentState(_reverseInitialStates[charKind2]);
+                        reversalStartState = new CurrentState(_reverseInitialStates[
+                            GetCharKind<FullInputReader>(input, matchEnd)]);
                     }
-                    uint charKind = GetCharKind<FullInputReader>(input, matchEnd);
-                    var startState = new CurrentState(_reverseInitialStates[charKind]);
                     matchStart = matchEnd < startat
                         ? startat
                     : (_containsEndZAnchor, _containsAnyAnchor) switch
                     {
                         (true, true) =>
                             FindStartPosition<FullInputReader, FullNullabilityHandler>(
-                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
                         (true, false) =>
                             FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(
-                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
                         (false, true) =>
                             FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(
-                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
                         (false, false) =>
                             FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(
-                            startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData),
+                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
                     };
                     break;
                 default:
@@ -511,34 +540,94 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             }
             else
             {
-                Registers endRegisters = _pattern._info.ContainsLineAnchor ?
+                Registers endRegisters = _containsAnyAnchor ?
                     FindSubcaptures<FullInputReader>(input, matchStart, matchEnd, perThreadData) :
                     FindSubcaptures<NoZAnchorInputReader>(input, matchStart, matchEnd, perThreadData);
                 return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
             }
         }
 
+        private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
+            where TOptimizedInputReader : struct, IOptimizedInputReader
+            where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
+            where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
+        {
+            int initialStatePosCandidate = pos;
+            var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<FullInputReader>(input, pos - 1)]);
+            int endPos = NoMatchExists;
+
+            while (true)
+            {
+                const int CharsPerTimeoutCheck = 1_000;
+                // TODO: maybe this should be for NFA mode only
+                int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
+                    pos + CharsPerTimeoutCheck :
+                    input.Length;
+
+                bool done;
+                if (currentState.NfaState is null)
+                    done =
+                        FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
+                            TAcceleratedStateHandler,
+                            TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
+                            currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
+                            ref initialStatePosCandidate);
+                else
+                    // nfa fallback check
+                    // assume \Z and full nullability for nfa since it's already extremely rare to get here
+                    done =
+                        FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
+                            FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
+                            ref initialStatePosCandidate, ref initialStatePosCandidate);
+
+                // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
+                // there is no more input available, then the whole search is done.
+                if (done || pos >= input.Length)
+                {
+                    break;
+                }
+
+                // The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and
+                // need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more
+                // input available.
+                if (pos < innerLoopLength)
+                {
+                    // Because there was still more input available, a failure to transition in DFA mode must be the cause
+                    // of the early exit. Upgrade to NFA mode.
+                    NfaMatchingState nfaState = perThreadData.NfaState;
+                    nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
+                    currentState = new CurrentState(nfaState);
+                }
+
+                // Check for a timeout before continuing.
+                if (_checkTimeout)
+                {
+                    CheckTimeout(timeoutOccursAt);
+                }
+            }
+            return endPos;
+        }
+
         /// <summary>Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call.</summary>
         /// <param name="input">The input text.</param>
         /// <param name="pos">The starting position in <paramref name="input"/>.</param>
         /// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
         /// <param name="mode">The mode of execution based on the regex operation being performed.</param>
-        /// <param name="initialStatePos">The last position the initial state of <see cref="_dotStarredPattern"/> was visited before the end position was found.</param>
         /// <param name="perThreadData">Per thread data reused between calls.</param>
         /// <returns>
         /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
         /// </returns>
-        private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, PerThreadData perThreadData)
+        private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
             where TNullabilityHandler : struct, INullabilityHandler
         {
-            initialStatePos = pos;
             int initialStatePosCandidate = pos;
 
             var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<TInputReader>(input, pos - 1)]);
 
             int endPos = NoMatchExists;
+            int endStateId = -1;
 
             while (true)
             {
@@ -549,40 +638,14 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                 // still check the timeout now and again to provide some semblance of the behavior a developer experiences with
                 // the backtracking engines.  We can, however, choose a large number here, since it's not actually needed for security.
                 const int CharsPerTimeoutCheck = 1_000;
-                // TODO: maybe this should be for NFA mode only
                 int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
                     pos + CharsPerTimeoutCheck :
                     input.Length;
 
+                bool done = currentState.NfaState is not null ?
+                    FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate) :
+                    FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
 
-
-                bool done;
-                if (currentState.NfaState is not null)
-                    // nfa fallback check
-                    done = FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate);
-                // else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null)
-                else if (_findOpts is null && !_containsEndZAnchor)
-                {
-                    done =
-                        // _mintermClassifier.IsAsciiOnly()
-                        // ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1,
-                        //     mode, ref pos,
-                        //     currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos,
-                        //     ref initialStatePosCandidate)
-                        // if there are no edge cases then use the quicker loop
-                        // :
-                        FindEndPositionDeltasDFANoSkip<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
-                        currentState.DfaStateId, ref endPos, ref initialStatePos,
-                        ref initialStatePosCandidate);
-                }
-                else
-                {
-                    // dfa loop with potential skipping
-                    done = FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate);
-                }
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
                 if (done || pos >= input.Length)
@@ -607,73 +670,90 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
                 {
                     CheckTimeout(timeoutOccursAt);
                 }
-
-
             }
+
+            // Check whether there's a fixed-length marker for the current state.  If there is, we can
+            // use that length to optimize subsequent matching phases.
+            // matchLength = endStateId > 0 ? GetState(endStateId).FixedLength(GetCharKind<TInputReader>(input, endPos)) : -1;
             return endPos;
         }
 
         /// <summary>
-        /// Ascii-only variant of the hot loop to conserve memory.
-        /// Only major difference is the minterm lookup:
-        /// `int positionId = c >= 128 ? 0 : mtlookup[c]`;
+        /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
+        /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
+        /// lazily building out the graph as needed.
         /// </summary>
-        private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+        /// <remarks>
+        /// The <typeparamref name="TStateHandler"/> supplies the actual transitioning logic, controlling whether processing is
+        /// performed in DFA mode or in NFA mode.  However, it expects <paramref name="state"/> to be configured to match,
+        /// so for example if <typeparamref name="TStateHandler"/> is a <see cref="DfaStateHandler"/>, it expects the <paramref name="state"/>'s
+        /// <see cref="CurrentState.DfaStateId"/> to be non-negative and its <see cref="CurrentState.NfaState"/> to be null; vice versa for
+        /// <see cref="NfaStateHandler"/>.
+        /// </remarks>
+        /// <returns>
+        /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
+        /// 0 if iteration completed because we reached an initial state.
+        /// A negative value if iteration completed because we ran out of input or we failed to transition.
+        /// </returns>
+        private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+            where TStateHandler : struct, IStateHandler
+            where TInputReader : struct, IInputReader
+            where TFindOptimizationsHandler : struct, IInitialStateHandler
+            where TNullabilityHandler : struct, INullabilityHandler
         {
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            // can only be used with full array initialized and <= 255 minterms
-            byte[] mtlookup = _mintermClassifier.ByteLookup()!;
-            int currStateId = startStateId;
+            // int endStateId = endStateIdRef;
+            int initialStatePos = initialStatePosRef;
+            int initialStatePosCandidate = initialStatePosCandidateRef;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    if (currStateId == _deadStateId)
+                    StateFlags flags = TStateHandler.GetStateFlags(this, in state);
+
+                    // Check if currentState represents an initial state. If it does, call into any possible find optimizations
+                    // to hopefully more quickly find the next possible starting location.
+                    if (flags.IsInitial())
+                    {
+                        if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
+                        {
+                            return true;
+                        }
+
+                        initialStatePosCandidate = pos;
+                    }
+
+                    // If the state is a dead end, such that we can't transition anywhere else, end the search.
+                    if (state.DfaStateId == _deadStateId)
                     {
                         return true;
                     }
 
-                    int c = input[pos];
-                    int positionId = c >= 128 ? 0 : mtlookup[c];
+                    int positionId = TInputReader.GetPositionId(this, input, pos);
 
-                    // If the state is nullable for the next character we found a potential end state.
-                    // note: the double array lookup is important here, storing a local variable is expensive
-                    if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, positionId))
+                    // If the state is nullable for the next character, meaning it accepts the empty string,
+                    // we found a potential end state.
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, flags))
                     {
+                        endPos = pos;
+                        // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
+                        initialStatePos = initialStatePosCandidate;
+
+                        // A match is known to exist.  If that's all we need to know, we're done.
+                        if (mode == RegexRunnerMode.ExistenceRequired)
                         {
-                            endPos = pos;
-                            // A match is known to exist.  If that's all we need to know, we're done.
-                            if (mode == RegexRunnerMode.ExistenceRequired)
-                            {
-                                return true;
-                            }
+                            return true;
                         }
                     }
 
                     // If there is more input available try to transition with the next character.
-                    // Note: the order here is important so the transition gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1)
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
                     {
-                        if (pos + 1 < input.Length)
-                        {
-                            return false;
-                        }
-                        pos++;
-                        // one off check for the final position
-                        // this is just to move it out of the hot loop
-                        if (!(_stateFlagsArray[currStateId].IsNullable() ||
-                             _stateArray[currStateId]!.IsNullableFor(
-                                 GetPositionKind(-1))))
-                        {
-                            return false;
-                        }
-                        // the end position (-1) was nullable
-                        endPos = pos;
-                        return mode == RegexRunnerMode.ExistenceRequired;
+                        return false;
                     }
 
                     // We successfully transitioned, so update our current input index to match.
@@ -685,56 +765,47 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                initialStatePosRef = currStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
+                // endStateIdRef = endStateId;
+                initialStatePosRef = initialStatePos;
+                initialStatePosCandidateRef = initialStatePosCandidate;
             }
         }
 
+
         /// <summary>
-        /// TODO: this is essentially a stripped down version when there's no good prefix optimizations
-        /// i don't trust the compiler to optimize this and it makes a
-        /// ~50% difference in performance with removing unnecessary checks alone
-        ///
+        /// tbd
         /// </summary>
-        private bool FindEndPositionDeltasDFANoSkip<TStateHandler, TInputReader, TFindOptimizationsHandler,
-            TNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
+        private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
+            TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
                 ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
-            where TStateHandler : struct, IStateHandler
-            where TInputReader : struct, IInputReader
-            where TFindOptimizationsHandler : struct, IInitialStateHandler
-            where TNullabilityHandler : struct, INullabilityHandler
+            where TOptimizedInputReader : struct, IOptimizedInputReader
+            where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
+            where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
             // initial check for input end to get it out of the loop
             if (posRef == input.Length)
             {
                 if (!(_stateFlagsArray[startStateId].IsNullable() ||
-                      _stateArray[startStateId]!.IsNullableFor(
-                          GetPositionKind(-1))))
+                      _stateArray[startStateId]!.IsNullableFor(GetPositionKind(-1))))
                 {
                     return true;
                 }
-
-                // the end position (-1) was nullable
+                // the end position kind (-1) was nullable
                 endPosRef = posRef;
                 return true;
             }
 
-            // Debug.Assert(posRef < input.Length, $"input end condition should be handled outside the loop");
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
             int endPos = endPosRef;
-            // can only be used with full array initialized and <= 255 minterms
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int currStateId = startStateId;
-            // ldfld only once
             int deadStateId = _deadStateId;
+            int initialStateId = _dotstarredInitialStates[CharKind.General].Id;
             try
             {
-                // Loop through each character in the input, transitioning from state to state for each.
                 // The goal is to make this loop as fast as it can possibly be,
                 // every single piece of overhead should be removed here
-                // there should be not a single callvirt instruction in the loop
-                // ldfld only if necessary (e.g. a reference changes)
-                // no memory writes unless necessary
                 while (true)
                 {
                     if (currStateId == deadStateId)
@@ -742,11 +813,18 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
                         return true;
                     }
 
-                    // acceleratedstatehandler
+                    if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
+                    this, mtlookup, input, ref currStateId, ref pos, initialStateId))
+                    {
+                        // future work could combine this with an immediate state transition
+                        // but this requires changing too much for now
+                        continue;
+                    }
 
                     // If the state is nullable for the next character, we found a potential end state.
                     // note: the double array lookup is important here, storing a local variable is expensive
-                    if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]]))
+                    // if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]]))
+                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup, input, pos))
                     {
                         {
                             endPos = pos;
@@ -760,7 +838,9 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
 
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1)
+                    if (!DfaStateHandler.TryTakeDFATransition(
+                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, input, pos))
+                        || pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
                         {
@@ -796,7 +876,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
 
         /// <summary>
         /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups
-        /// Workhorse inner loop for <see cref="FindEndPosition"/>.  Consumes the <paramref name="input"/> character by character,
+        /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
         /// </summary>
@@ -886,7 +966,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
         /// <summary>
         /// TODO: this is the fallback NFA function
-        /// Workhorse inner loop for <see cref="FindEndPosition"/>.  Consumes the <paramref name="input"/> character by character,
+        /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
         /// </summary>
@@ -921,9 +1001,6 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
                 {
                     StateFlags flags = TStateHandler.GetStateFlags(this, in state);
 
-                    // TFindOptimizationsHandler is redundant here as
-                    // going into NFA mode signals something already exploded
-
                     // Dead end here means the set is empty
                     if (state.NfaState!.NfaStateSet.Count == 0)
                     {
@@ -1650,6 +1727,58 @@ public static void UndoTransition(ref CurrentState state)
 #endif
         }
 
+        private interface IOptimizedInputReader
+        {
+            public static abstract int GetPositionId(byte[] lookup, ReadOnlySpan<char> input,
+                int pos);
+        }
+
+        private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader
+        {
+            public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
+            {
+                Debug.Assert(pos < input.Length);
+                return input[pos] >= 128 ? 0 : lookup[input[pos]];
+            }
+        }
+
+        private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader
+        {
+            public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
+            {
+                Debug.Assert(pos < input.Length);
+                Debug.Assert(lookup.Length == (ushort.MaxValue + 1));
+                return lookup[input[pos]];
+            }
+        }
+
+        private interface IOptimizedNullabilityHandler
+        {
+            public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                byte[] nullabilityArray, int
+                    currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+                where TOptimizedInputReader : struct, IOptimizedInputReader;
+        }
+
+        private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+        {
+            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+                where TOptimizedInputReader : struct, IOptimizedInputReader
+            {
+                return nullabilityArray[currStateId] > 0;
+            }
+        }
+
+        private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+        {
+            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+                where TOptimizedInputReader : struct, IOptimizedInputReader
+            {
+                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos));
+            }
+        }
+
         /// <summary>
         /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to
         /// both take transitions and decide nullability. For positions of valid characters that are handled normally,
@@ -1662,11 +1791,7 @@ private interface IInputReader
             public static abstract int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos);
         }
 
-        private readonly struct OptimizedAsciiInputReader : IInputReader
-        {
-            public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos) =>
-                matcher._mintermClassifier.GetMintermID(input[pos]);
-        }
+
 
         /// <summary>This reader omits the special handling of \n for the \Z anchor.</summary>
         private readonly struct NoZAnchorInputReader : IInputReader
@@ -1701,35 +1826,77 @@ public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRe
         }
 
         /// <summary>
-        /// Interface for optimizations to accelerate search from initial states.
+        /// Interface for accelerated states, returns true if position was changed
         /// </summary>
         private interface IAcceleratedStateHandler
         {
-            public static abstract void TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
-                where TInputReader : struct, IInputReader;
+            public static abstract bool TryFindNextStartingPosition<TOptimizedInputReader>(
+                SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input, ref
+                int currentStateId, ref int pos, int initialStateId)
+                where TOptimizedInputReader : struct, IOptimizedInputReader;
         }
 
+        private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                byte[] lookup,
+                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+                where TOptimizedInputReader : struct, IOptimizedInputReader
+
+            {
+                // if (!matcher._canBeAcceleratedArray[currentStateId])
+                if (currentStateId != initialStateId)
+                    return false;
+                // Find the first position that matches with some likely character.
+                if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+                {
+                    return false;
+                }
+
+                // No match exists
+                currentStateId = matcher._deadStateId;
+                pos = input.Length;
+                return true;
+            }
+        }
         private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static void TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
-                where TInputReader : struct, IInputReader
+            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                byte[] lookup,
+                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+                where TOptimizedInputReader : struct, IOptimizedInputReader
+
             {
+                // if (!matcher._canBeAcceleratedArray[currentStateId])
+                if (currentStateId != initialStateId)
+                    return false;
+                // int tempPos = pos;
                 // Find the first position that matches with some likely character.
                 if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     // No match exists
-                    state = new CurrentState(matcher.GetState(matcher._deadStateId));
+                    currentStateId = matcher._deadStateId;
                     pos = input.Length;
-                    return;
+                    return true;
                 }
+                currentStateId = matcher._dotstarredInitialStates[
+                    matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1]
+                    ].Id;
+                return false;
+            }
+        }
 
-                // Update the starting state based on where TryFindNextStartingPosition moved us to.
-                // As with the initial starting state, if it's a dead end, no match exists.
-                state = new CurrentState(
-                    matcher._dotstarredInitialStates[matcher.GetCharKind<TInputReader>(input, pos - 1)]);
-                return;
+        private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+                byte[] lookup,
+                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+                where TOptimizedInputReader : struct, IOptimizedInputReader
+            {
+                return false;
             }
         }
 

From d68bd3c4eb01b414147059b4aed230dde29367f8 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 18 Jun 2024 21:06:32 +0300
Subject: [PATCH 19/63] refactoring optimizations

---
 .../Symbolic/MatchReversal.cs                 | 18 +++--
 .../Symbolic/MintermClassifier.cs             |  5 +-
 .../Symbolic/SymbolicRegexMatcher.cs          | 66 +++++++++----------
 3 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
index b7be92195ee58..215aa65a1d14f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -3,15 +3,13 @@
 
 namespace System.Text.RegularExpressions.Symbolic;
 
-internal sealed class MatchReversal<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
+internal sealed class MatchReversal<TSet>(
+    MatchReversalKind kind,
+    int fixedLength,
+    MatchingState<TSet>? adjustedStartState = null)
+    where TSet : IComparable<TSet>, IEquatable<TSet>
 {
-    public MatchReversal(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
-    {
-        Kind = kind;
-        FixedLength = fixedLength;
-        AdjustedStartState = adjustedStartState;
-    }
-    internal MatchReversalKind Kind { get; }
-    internal int FixedLength { get; }
-    internal MatchingState<TSet>? AdjustedStartState { get; }
+    internal MatchReversalKind Kind { get; } = kind;
+    internal int FixedLength { get; } = fixedLength;
+    internal MatchingState<TSet>? AdjustedStartState { get; } = adjustedStartState;
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 96cd72c4c0c95..e6352395c670d 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -48,13 +48,10 @@ public MintermClassifier(BDD[] minterms)
             }
 
             // ascii-only array to save memory
-            // int mintermId = c >= 128 ? 0 : mtlookup[c];
-            // _isAsciiOnly = true;
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
-                (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                if (mintermRanges[^1].Item2 >= 128)
+                if (BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2 >= 128)
                 {
                     _isAsciiOnly = false;
                 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 3f0543772b52a..0f0e5b254c113 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -84,6 +84,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
         private readonly int _deadStateId;
 
+        /// <summary>Initial state used to for vectorization</summary>
+        private readonly int _initialStateId;
+
         /// <summary>Whether the pattern contains any anchor</summary>
         private readonly bool _containsAnyAnchor;
 
@@ -255,8 +258,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
             // The loops below and how character kinds are calculated assume that the "general" character kind is zero
             Debug.Assert(CharKind.General == 0);
 
-            // Assign dead state id
-            _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
 
             // Assign edge case info for quick lookup
             _containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
@@ -284,6 +285,11 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
             }
             _dotstarredInitialStates = dotstarredInitialStates;
 
+            // Assign dead state id
+            _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
+            // Assign initial state id
+            _initialStateId = _dotstarredInitialStates[CharKind.General].Id;
+
             // Create the reverse pattern (the original pattern in reverse order) and all of its
             // initial states. Also disable backtracking simulation to ensure the reverse path from
             // the final state that was found is followed. Not doing so might cause the earliest
@@ -573,8 +579,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                             currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
                             ref initialStatePosCandidate);
                 else
-                    // nfa fallback check
-                    // assume \Z and full nullability for nfa since it's already extremely rare to get here
+                    // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     done =
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
                             FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
@@ -785,13 +790,11 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
             // initial check for input end to get it out of the loop
             if (posRef == input.Length)
             {
-                if (!(_stateFlagsArray[startStateId].IsNullable() ||
-                      _stateArray[startStateId]!.IsNullableFor(GetPositionKind(-1))))
+                if (_stateArray[startStateId]!.IsNullableFor(_positionKinds[0]))
                 {
-                    return true;
+                    // the end position kind was nullable
+                    endPosRef = posRef;
                 }
-                // the end position kind (-1) was nullable
-                endPosRef = posRef;
                 return true;
             }
 
@@ -801,7 +804,7 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int currStateId = startStateId;
             int deadStateId = _deadStateId;
-            int initialStateId = _dotstarredInitialStates[CharKind.General].Id;
+            int initialStateId = _initialStateId;
             try
             {
                 // The goal is to make this loop as fast as it can possibly be,
@@ -822,17 +825,13 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
-                    // note: the double array lookup is important here, storing a local variable is expensive
-                    // if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]]))
                     if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup, input, pos))
                     {
+                        endPos = pos;
+                        // A match is known to exist.  If that's all we need to know, we're done.
+                        if (mode == RegexRunnerMode.ExistenceRequired)
                         {
-                            endPos = pos;
-                            // A match is known to exist.  If that's all we need to know, we're done.
-                            if (mode == RegexRunnerMode.ExistenceRequired)
-                            {
-                                return true;
-                            }
+                            return true;
                         }
                     }
 
@@ -850,8 +849,8 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                         // one off check for the final position
                         // this is just to move it out of the hot loop
                         if (!(_stateFlagsArray[currStateId].IsNullable() ||
-                             _stateArray[currStateId]!.IsNullableFor(
-                                 GetPositionKind(-1))))
+                              _stateArray[currStateId]!.IsNullableFor(
+                                  GetPositionKind(-1))))
                         {
                             return false;
                         }
@@ -859,7 +858,6 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                         endPos = pos;
                         return mode == RegexRunnerMode.ExistenceRequired;
                     }
-
                     // We successfully transitioned, so update our current input index to match.
                     pos++;
                 }
@@ -1735,6 +1733,7 @@ public static abstract int GetPositionId(byte[] lookup, ReadOnlySpan<char> input
 
         private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader
         {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
             {
                 Debug.Assert(pos < input.Length);
@@ -1744,6 +1743,7 @@ public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos
 
         private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader
         {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
             {
                 Debug.Assert(pos < input.Length);
@@ -1762,6 +1762,7 @@ public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatch
 
         private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
         {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
@@ -1771,6 +1772,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
 
         private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
         {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
                 byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
@@ -1848,7 +1850,6 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 // if (!matcher._canBeAcceleratedArray[currentStateId])
                 if (currentStateId != initialStateId)
                     return false;
-                // Find the first position that matches with some likely character.
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     return false;
@@ -1872,19 +1873,18 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 // if (!matcher._canBeAcceleratedArray[currentStateId])
                 if (currentStateId != initialStateId)
                     return false;
-                // int tempPos = pos;
-                // Find the first position that matches with some likely character.
-                if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+                if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
-                    // No match exists
-                    currentStateId = matcher._deadStateId;
-                    pos = input.Length;
-                    return true;
-                }
-                currentStateId = matcher._dotstarredInitialStates[
-                    matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1]
+                    currentStateId = matcher._dotstarredInitialStates[
+                        matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1]
                     ].Id;
-                return false;
+                    return false;
+                }
+
+                // No match exists
+                currentStateId = matcher._deadStateId;
+                pos = input.Length;
+                return true;
             }
         }
 

From 153dfc30ea0bb6e96a19e3e4243834fe0f2b60df Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 19 Jun 2024 02:29:56 +0300
Subject: [PATCH 20/63] fallback mode and bugfix

---
 .../src/System.Text.RegularExpressions.csproj |  6 +--
 .../Symbolic/MintermClassifier.cs             | 46 ++++++++++---------
 .../Symbolic/SymbolicRegexMatcher.cs          | 41 ++++++++++-------
 .../FunctionalTests/NonBacktrackingTests.cs   | 22 ---------
 ...ystem.Text.RegularExpressions.Tests.csproj |  1 -
 5 files changed, 51 insertions(+), 65 deletions(-)
 delete mode 100644 src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index b3fda5f2f4326..0d952017013c0 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,9 +5,9 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-<!--   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>-->
-<!--&lt;!&ndash;   documentation warnings &ndash;&gt;-->
-<!--   <NoWarn>$(NoWarn);CS1574</NoWarn>-->
+   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>
+<!--   documentation warnings -->
+   <NoWarn>$(NoWarn);CS1574</NoWarn>
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index e6352395c670d..96ab22ce8f967 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -31,7 +31,7 @@ internal sealed class MintermClassifier
         // /// fallback lookup if over 255 minterms
         // /// this is almost never used
         // /// </summary>
-        // private readonly int[]? _intLookup;
+        private readonly int[]? _intLookup;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
@@ -57,28 +57,24 @@ public MintermClassifier(BDD[] minterms)
                 }
             }
 
-            // i have never seen a regex use over 80 minterms not to speak of 255,
-            // but it's there as a fallback mechanism
+            // It's incredibly rare for a regex to use more than a hundred or two minterms,
+            // but we need a fallback just in case.
             if (minterms.Length > 255)
             {
-                // WIP: temporary exception to see if any tests in the pipeline reach this
-                // if nothing reaches this perhaps it'd be easier to just throw an exception
-                // during construction
-                throw new Exception($"reached over 255 minterms, count {minterms}");
                 // over 255 unique sets also means it's never ascii only
-                // int[] lookup = new int[ushort.MaxValue + 1];
-                // for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
-                // {
-                //     // precompute all assigned minterm categories
-                //     (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                //     foreach ((uint start, uint end) in mintermRanges)
-                //     {
-                //         // assign character ranges in bulk
-                //         Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                //         slice.Fill(mintermId);
-                //     }
-                // }
-                // _intLookup = lookup;
+                int[] lookup = new int[ushort.MaxValue + 1];
+                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+                {
+                    // precompute all assigned minterm categories
+                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                    foreach ((uint start, uint end) in mintermRanges)
+                    {
+                        // assign character ranges in bulk
+                        Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
+                        slice.Fill(mintermId);
+                    }
+                }
+                _intLookup = lookup;
             }
             else
             {
@@ -108,8 +104,7 @@ public int GetMintermID(int c)
             }
             // high performance variant would use a span directly.
             // additional memory is saved by using a byte
-            return _lookup![c];
-            // return _intLookup is null ? _lookup![c] : _intLookup[c];
+            return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
 
         /// <summary>
@@ -126,5 +121,12 @@ public int GetMintermID(int c)
         /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public byte[]? ByteLookup() => _lookup;
+
+        /// <summary>
+        /// Int lookup for rare cases
+        /// </summary>
+        /// <returns></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int[]? IntLookup() => _intLookup;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 0f0e5b254c113..76614272556f6 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -415,12 +415,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
             // the c as the low boundary for the starting position.d
-            // int matchStartLowBoundary = startat;
             int matchEnd;
-            if (!_containsEndZAnchor)
+            // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases
+            if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null))
             {
-                bool isAsciiOnly = _mintermClassifier.IsAsciiOnly();
-                matchEnd = (isAsciiOnly, _findOpts is not null, _containsAnyAnchor) switch
+                matchEnd = (_mintermClassifier.IsAsciiOnly(), _findOpts is not null, _containsAnyAnchor) switch
                 {
                     (true, true, true) =>
                         FindEndPositionOptimized<OptimizedAsciiInputReader, AcceleratedStateHandler,
@@ -440,17 +439,17 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                     (false, true, true) =>
                         FindEndPositionOptimized<OptimizedUnicodeInputReader, AcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, true) =>
+                    (false, false, false) =>
                         FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, false) =>
+                    (false, false, true) =>
                         FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                 };
             }
             else
             {
-                // fallback for EndZ anchor
+                // fallback for Z anchor or over 255 minterms
                 matchEnd = (_findOpts is not null) switch
                 {
                     true =>
@@ -584,7 +583,6 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
                             FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
                             ref initialStatePosCandidate, ref initialStatePosCandidate);
-
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
                 if (done || pos >= input.Length)
@@ -789,6 +787,7 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
         {
             // initial check for input end to get it out of the loop
             if (posRef == input.Length)
+
             {
                 if (_stateArray[startStateId]!.IsNullableFor(_positionKinds[0]))
                 {
@@ -821,7 +820,17 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                     {
                         // future work could combine this with an immediate state transition
                         // but this requires changing too much for now
-                        continue;
+                        if (pos == input.Length)
+                        {
+                            // patterns such as ^$ can be nullable right away
+                            if (_stateArray[currStateId]!.IsNullableFor(_positionKinds[0]))
+                            {
+                                // the end position kind was nullable
+                                endPos = pos;
+                            }
+                            currStateId = deadStateId;
+                            continue;
+                        }
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
@@ -852,11 +861,11 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
                               _stateArray[currStateId]!.IsNullableFor(
                                   GetPositionKind(-1))))
                         {
-                            return false;
+                            return true;
                         }
                         // the end position (-1) was nullable
                         endPos = pos;
-                        return mode == RegexRunnerMode.ExistenceRequired;
+                        return true;
                     }
                     // We successfully transitioned, so update our current input index to match.
                     pos++;
@@ -912,8 +921,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     {
                         return true;
                     }
-                    // TAcceleratedStateHandler.TryNextPosition
-                    if (_canBeAcceleratedArray[state.DfaStateId])
+                    if ((_stateFlagsArray[state.DfaStateId] & StateFlags.IsAcceleratedFlag) != 0)
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
                         {
@@ -1777,6 +1785,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
                 byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
+                Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}");
                 return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos));
             }
         }
@@ -1847,12 +1856,11 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 where TOptimizedInputReader : struct, IOptimizedInputReader
 
             {
-                // if (!matcher._canBeAcceleratedArray[currentStateId])
                 if (currentStateId != initialStateId)
                     return false;
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
-                    return false;
+                    return true;
                 }
 
                 // No match exists
@@ -1870,7 +1878,6 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 where TOptimizedInputReader : struct, IOptimizedInputReader
 
             {
-                // if (!matcher._canBeAcceleratedArray[currentStateId])
                 if (currentStateId != initialStateId)
                     return false;
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
@@ -1878,7 +1885,7 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                     currentStateId = matcher._dotstarredInitialStates[
                         matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1]
                     ].Id;
-                    return false;
+                    return true;
                 }
 
                 // No match exists
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs
deleted file mode 100644
index 501df78391690..0000000000000
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs
+++ /dev/null
@@ -1,22 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-using System.Collections;
-using System.Collections.Generic;
-using Xunit;
-
-namespace System.Text.RegularExpressions.Tests
-{
-    /// <summary>
-    /// TODO: Create tests here later
-    /// </summary>
-    public static partial class NonBacktrackingTests
-    {
-
-        // [Fact]
-        // public static void Test()
-        // {
-        // }
-
-    }
-}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
index afdd6f1e51f24..dbab47f63d097 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj
@@ -19,7 +19,6 @@
     <Compile Include="GroupCollectionTests.cs" />
     <Compile Include="MatchCollectionTests.cs" />
     <Compile Include="MonoRegexTests.cs" />
-    <Compile Include="NonBacktrackingTests.cs" />
     <Compile Include="Regex.CompileToAssembly.Tests.cs" />
     <Compile Include="Regex.Ctor.Tests.cs" />
     <Compile Include="Regex.Cache.Tests.cs" />

From 4aebe3e77138ec78d34c5be309100dfa0d35b17b Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 19 Jun 2024 02:30:47 +0300
Subject: [PATCH 21/63] reenable warnings

---
 .../src/System.Text.RegularExpressions.csproj               | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 0d952017013c0..b3fda5f2f4326 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,9 +5,9 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>
-<!--   documentation warnings -->
-   <NoWarn>$(NoWarn);CS1574</NoWarn>
+<!--   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>-->
+<!--&lt;!&ndash;   documentation warnings &ndash;&gt;-->
+<!--   <NoWarn>$(NoWarn);CS1574</NoWarn>-->
   </PropertyGroup>
 
   <ItemGroup>

From 1e6f55cb102c312c44001251ad5f2db4fd87bfa7 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 19 Jun 2024 04:00:32 +0300
Subject: [PATCH 22/63] anchor edge case

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs        | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index eac516c43bcdd..10da6f1af060f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -204,6 +204,12 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
             int pos = 0;
             SymbolicRegexNode<TSet>? current = node;
             bool canLoop = true;
+            // finding anchors inside pattern invalidates this optimization
+            var bail = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
+            {
+                pos = 0;
+                return (false, node);
+            });
             var addSingleton = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
                 pos += 1;
@@ -257,6 +263,12 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                         addSingleton(current),
                     {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
                         addFixedLengthLoop(current),
+                    {
+                        _kind: SymbolicRegexNodeKind.Concat,
+                        _left._info.ContainsSomeAnchor:true,
+                        _right._kind: SymbolicRegexNodeKind.Concat
+                    } =>
+                        bail(current),
                     _ => (false, current)
                 };
                 canLoop = loop;

From c6ad3ac3560c865feb9fbda0a48e30e742288a6c Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 19 Jun 2024 23:36:16 +0300
Subject: [PATCH 23/63] anchor edge cases

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs          | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 10da6f1af060f..e5b6cf1d1ed9b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -253,6 +253,10 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
 #endif
                 (bool loop, SymbolicRegexNode<TSet> next) = current switch
                 {
+                    // This could potentially be a very good future optimization for
+                    // anchors but there's too many edge cases to guarantee it works.
+                    // one example which fails currently: pattern: @"\By\b", input: "xy"
+                    { _info.ContainsSomeAnchor: true } => bail(current),
                     // if this is reached then entire match is fixed length
                     { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
                     {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} =>
@@ -263,12 +267,6 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                         addSingleton(current),
                     {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
                         addFixedLengthLoop(current),
-                    {
-                        _kind: SymbolicRegexNodeKind.Concat,
-                        _left._info.ContainsSomeAnchor:true,
-                        _right._kind: SymbolicRegexNodeKind.Concat
-                    } =>
-                        bail(current),
                     _ => (false, current)
                 };
                 canLoop = loop;

From e10b43f3ba60b04da8ffcd94b0856cd62ef4a4e6 Mon Sep 17 00:00:00 2001
From: ieviev <36763595+ieviev@users.noreply.github.com>
Date: Thu, 20 Jun 2024 00:26:41 +0300
Subject: [PATCH 24/63] Apply suggestions from code review

Co-authored-by: Stephen Toub <stoub@microsoft.com>
---
 .../Symbolic/MatchingState.cs                 |  7 +-
 .../Symbolic/MintermClassifier.cs             |  1 +
 .../Symbolic/RegexNodeConverter.cs            | 85 -------------------
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  4 +-
 .../Symbolic/SymbolicRegexMatcher.cs          |  8 +-
 5 files changed, 9 insertions(+), 96 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 5bd2baf668d3d..1622107e8d9ce 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -20,7 +20,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         /// <summary>
         /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
         /// </summary>
-        internal readonly int NullabilityInfo;
+        internal int NullabilityInfo { get; }
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
@@ -154,7 +154,8 @@ internal StateFlags BuildStateFlags(bool isInitial)
         }
 
         /// <summary>
-        /// nullability for each context is encoded in a bit
+        /// Builds the nullability information for the matching statie.
+        /// Nullability for each context is encoded in a bit
         /// 0 means node cannot be nullable
         /// 00001 -> nullable for General
         /// 00010 -> nullable for BeginningEnd
@@ -162,7 +163,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
         /// 01000 -> nullable for NewLineS
         /// 10000 -> nullable for WordLetter
         /// </summary>
-        /// <returns></returns>
         internal byte BuildNullabilityInfo()
         {
             byte nullabilityInfo = 0;
@@ -173,6 +173,7 @@ internal byte BuildNullabilityInfo()
                     nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0);
                 }
             }
+
             return nullabilityInfo;
         }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 96ab22ce8f967..3ee908ffd0f06 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -102,6 +102,7 @@ public int GetMintermID(int c)
             {
                 return 0;
             }
+
             // high performance variant would use a span directly.
             // additional memory is saved by using a byte
             return _intLookup is null ? _lookup![c] : _intLookup[c];
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
index 31f01271d558b..9194ca00c971c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
@@ -527,90 +527,5 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
                 }
             }
         }
-
-
-        // /// <summary>
-        // /// attempt to remove anchors when possible since it reduces overhead
-        // /// more rewrites could be tried but it's important to preserve PCRE semantics
-        // /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner
-        // /// </summary>
-        // /// <param name="builder"></param>
-        // /// <param name="rootNode"></param>
-        // /// <returns></returns>
-    //     internal static SymbolicRegexNode<BDD> ApplyRootRewrites(SymbolicRegexBuilder<BDD> builder, SymbolicRegexNode<BDD> rootNode)
-    //     {
-    //         // only consider removing anchors, otherwise bail
-    //         if (!rootNode._info.ContainsSomeAnchor) return rootNode;
-
-    //         // Func<string, bool> _wout = st =>
-    //         // {
-    //         //     var a_cons = System.Reflection.Assembly.Load("System.Console");
-    //         //     var t_cons = a_cons.GetType("System.Console")!;
-    //         //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
-    //         //     wl!.Invoke(null, [st]);
-    //         //     return true;
-    //         // };
-
-    //         SymbolicRegexNode<BDD> ApplyRewrites(SymbolicRegexNode<BDD> node)
-    //         {
-    //             // Guard against stack overflow due to deep recursion
-    //             if (!StackHelper.TryEnsureSufficientExecutionStack())
-    //             {
-    //                 return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node));
-    //             }
-
-    //             var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver);
-
-    //             switch (node._kind)
-    //             {
-    //                 case SymbolicRegexNodeKind.Concat:
-    //                     // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}");
-    //                     switch (node._left!._kind)
-    //                     {
-    //                         case SymbolicRegexNodeKind.CaptureStart:
-    //                             return builder.CreateConcat(node._left, ApplyRewrites(node._right!));
-    //                         case SymbolicRegexNodeKind.BoundaryAnchor:
-    //                             return node._right! switch
-    //                             {
-    //                                 // \b\w{1,}.. -> \w{1,}
-    //                                 // anchor to the left can be removed
-    //                                 {
-    //                                     _kind: SymbolicRegexNodeKind.Concat, _left:
-    //                                     {
-    //                                         _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue
-
-    //                                     } wordLoop
-    //                                 }
-    //                                  when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!),
-    //                                 _ => node
-    //                             };
-    //                         case SymbolicRegexNodeKind.Loop:
-    //                             var loopnode = node._left!;
-    //                             // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case
-    //                             bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1;
-    //                             bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl);
-    //                             return node._right! switch
-    //                             {
-    //                                 // anchor to the right can be removed
-    //                                 {
-    //                                     _kind: SymbolicRegexNodeKind.Concat,
-    //                                     _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor,
-    //                                     _right._kind: SymbolicRegexNodeKind.CaptureEnd
-    //                                 } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)),
-    //                                 _ => node
-    //                             };
-    //                     }
-    //                     return node;
-
-
-    //                 default:
-    //                     return node;
-    //             }
-    //         }
-
-    //         SymbolicRegexNode<BDD> rewritten = ApplyRewrites(rootNode);
-    //         // _wout(rewritten.ToString());
-    //         return rewritten;
-    //     }
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index e5b6cf1d1ed9b..2a573d9eee285 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -138,9 +138,6 @@ private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
         /// <summary>
         /// Pre-computed hot-loop version of nullability check
         /// </summary>
-        /// <param name="stateId"></param>
-        /// <param name="mintermId"></param>
-        /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private bool IsNullableWithContext(int stateId, int mintermId) =>
             ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0;
@@ -243,6 +240,7 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                         return (false, concatNode);
                 }
             });
+
             while (canLoop)
             {
 #if DEBUG
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 76614272556f6..1bdc128a34899 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -233,8 +233,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
                         _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength,
                     };
                 });
-                // a DFA is sometimes 10x-100x faster than the optimizations
-                // the "IsUseful" is harming the engine here
+
+                // In some cases where the findOptimizations are useful, just using the DFA can still be faster.
                 _findOpts = findOptimizations switch
                 {
                     { FindMode: FindNextStartingPositionMode.FixedDistanceString_LeftToRight } => findOptimizations,
@@ -242,7 +242,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
                         findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke)? null : findOptimizations,
                     { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } => setIsTooCommon(
                         findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations,
-                    _ => findOptimizations // TODO: unsure which options are left here
+                    _ => findOptimizations
                 };
                 // _findOpts = findOptimizations;
                 // _findOpts = null;
@@ -349,8 +349,6 @@ uint CalculateMintermIdKind(int mintermId)
         /// </summary>
         internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
 
-        /// TODO: when you're calling a function millions of times per second even this add 1 does cost something
-        /// this should be ideally remapped
         /// <summary>Look up what is the character kind given a position ID</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];

From f581755eeef6d0287c63f94c6e5fda112d32a9da Mon Sep 17 00:00:00 2001
From: ieviev <36763595+ieviev@users.noreply.github.com>
Date: Thu, 27 Jun 2024 14:39:33 +0300
Subject: [PATCH 25/63] Apply suggestions from code review

Co-authored-by: Stephen Toub <stoub@microsoft.com>
---
 .../src/System.Text.RegularExpressions.csproj |  3 ---
 .../Symbolic/MatchingState.cs                 |  2 +-
 .../Symbolic/MintermClassifier.cs             | 16 +++++++-------
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  9 --------
 .../Symbolic/SymbolicRegexMatcher.cs          | 21 ++++++-------------
 5 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index b3fda5f2f4326..5ec4d230d7ba5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -5,9 +5,6 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <DefineConstants>$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS</DefineConstants>
     <UseCompilerGeneratedDocXmlFile>false</UseCompilerGeneratedDocXmlFile>
-<!--   <NoWarn>IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649;</NoWarn>-->
-<!--&lt;!&ndash;   documentation warnings &ndash;&gt;-->
-<!--   <NoWarn>$(NoWarn);CS1574</NoWarn>-->
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 1622107e8d9ce..41251ccc82c83 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -154,7 +154,7 @@ internal StateFlags BuildStateFlags(bool isInitial)
         }
 
         /// <summary>
-        /// Builds the nullability information for the matching statie.
+        /// Builds the nullability information for the matching state.
         /// Nullability for each context is encoded in a bit
         /// 0 means node cannot be nullable
         /// 00001 -> nullable for General
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 3ee908ffd0f06..fc8a1abdd55d9 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -27,10 +27,9 @@ internal sealed class MintermClassifier
         /// <summary>Conserve memory if pattern is ascii-only</summary>
         private readonly bool _isAsciiOnly;
 
-        // /// <summary>
-        // /// fallback lookup if over 255 minterms
-        // /// this is almost never used
-        // /// </summary>
+        /// <summary>
+        /// Fallback lookup if over 255 minterms. This is rarely used.
+        /// </summary>
         private readonly int[]? _intLookup;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
@@ -116,17 +115,16 @@ public int GetMintermID(int c)
         public bool IsAsciiOnly() => _isAsciiOnly;
 
         /// <summary>
-        /// Quick mapping from char to minterm,
-        /// can be null if there is over 255 minterms
+        /// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
+        /// Null if there are greater than 255 minterms.
         /// </summary>
-        /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public byte[]? ByteLookup() => _lookup;
 
         /// <summary>
-        /// Int lookup for rare cases
+        /// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
+        /// Null in the common case where there are fewer than 255 minterms.
         /// </summary>
-        /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int[]? IntLookup() => _intLookup;
     }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 2a573d9eee285..0925738c9a41e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -59,15 +59,6 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// </summary>
         private bool[] _canBeAcceleratedArray;
 
-#if DEBUG
-        // private readonly Action<string> _wout = st =>
-        // {
-        //     var a_cons = System.Reflection.Assembly.Load("System.Console");
-        //     var t_cons = a_cons.GetType("System.Console")!;
-        //     var wl = t_cons.GetMethod("WriteLine", [typeof(string)]);
-        //     wl!.Invoke(null, [st]);
-        // };
-#endif
         /// <summary>
         /// The transition function for DFA mode.
         /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 1bdc128a34899..0e285be987fec 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -218,9 +218,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
 
                 var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
                 {
-                    // _wout($"rn{fds.Range is null}");
-                    // _wout($"cn{fds.Chars is null}");
-                    // _wout($"cc{fds.Chars!.Length}");
                     return fds switch
                     {
                         { Chars: not null } =>
@@ -237,18 +234,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
                 // In some cases where the findOptimizations are useful, just using the DFA can still be faster.
                 _findOpts = findOptimizations switch
                 {
-                    { FindMode: FindNextStartingPositionMode.FixedDistanceString_LeftToRight } => findOptimizations,
-                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } =>
-                        findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke)? null : findOptimizations,
-                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } => setIsTooCommon(
-                        findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations,
+                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when  findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke) => null,
+                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when setIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
                     _ => findOptimizations
                 };
-                // _findOpts = findOptimizations;
-                // _findOpts = null;
-                // _wout($"{findOptimizations.FindMode}");
-                // _wout($"{findOptimizations.FixedDistanceSets![0]}");
-                // _wout($"o{_findOpts}");
             }
 
             // Determine the number of initial states. If there's no anchor, only the default previous
@@ -485,6 +474,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                 case MatchReversalKind.FixedLength:
                     matchStart = (matchEnd - _optimizedReversalState.FixedLength);
                     break;
+
                 case MatchReversalKind.MatchStart:
                 case MatchReversalKind.PartialFixedLength:
                     int initialLastStart = -1; // invalid sentinel value
@@ -581,6 +571,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
                             FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
                             ref initialStatePosCandidate, ref initialStatePosCandidate);
+
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
                 if (done || pos >= input.Length)
@@ -1856,6 +1847,7 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
             {
                 if (currentStateId != initialStateId)
                     return false;
+
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     return true;
@@ -1878,6 +1870,7 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
             {
                 if (currentStateId != initialStateId)
                     return false;
+
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     currentStateId = matcher._dotstarredInitialStates[
@@ -1975,8 +1968,6 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
                 where TStateHandler : struct, IStateHandler
             {
                 return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
-                // cannot be used in NFA mode
-                // return matcher.IsNullableWithContext(state.DfaStateId, positionId);
             }
         }
     }

From 01a9684d65fea316ef062a2c10978939e6fd900f Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:20:59 +0300
Subject: [PATCH 26/63] rebased branch and some cleanup

---
 .../src/System.Text.RegularExpressions.csproj             | 4 ++--
 .../Text/RegularExpressions/Symbolic/MatchReversal.cs     | 2 +-
 .../Text/RegularExpressions/Symbolic/MatchingState.cs     | 8 ++------
 .../System/Text/RegularExpressions/Symbolic/StateFlags.cs | 5 -----
 4 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 5ec4d230d7ba5..86353b31b5d7b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -72,6 +72,8 @@
     <Compile Include="System\Text\RegularExpressions\Symbolic\MatchingState.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\ISolver.cs" />
+    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversalKind.cs"/>
+    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversal.cs"/>
     <Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\MintermGenerator.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\RegexNodeConverter.cs" />
@@ -93,8 +95,6 @@
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryConditions.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryRanges.cs" />
     <Compile Include="System\Text\RegularExpressions\Symbolic\UnicodeCategoryRangesGenerator.cs" />
-    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversalKind.cs"/>
-    <Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversal.cs"/>
     <!-- Common or Common-branched source files -->
     <Compile Include="$(CommonPath)System\HexConverter.cs" Link="Common\System\HexConverter.cs" />
     <Compile Include="$(CommonPath)System\Obsoletions.cs" Link="Common\System\Obsoletions.cs" />
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
index 215aa65a1d14f..cd00755dbe6dc 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -3,7 +3,7 @@
 
 namespace System.Text.RegularExpressions.Symbolic;
 
-internal sealed class MatchReversal<TSet>(
+internal readonly struct MatchReversal<TSet>(
     MatchReversalKind kind,
     int fixedLength,
     MatchingState<TSet>? adjustedStartState = null)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 41251ccc82c83..55032b39d9bb1 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -17,9 +17,6 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
             NullabilityInfo = BuildNullabilityInfo();
         }
 
-        /// <summary>
-        /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
-        /// </summary>
         internal int NullabilityInfo { get; }
 
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
@@ -102,8 +99,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         }
 
         /// <summary>
-        /// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned
-        /// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
+        /// Cached nullability check with encoded bits
         /// whereever possible
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -170,7 +166,7 @@ internal byte BuildNullabilityInfo()
             {
                 for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
                 {
-                    nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0);
+                    nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0);
                 }
             }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index 990eb4807c7f1..c1628ebbcf312 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -28,18 +28,13 @@ internal enum StateFlags : byte
     /// </summary>
     internal static class StateFlagsExtensions
     {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None;
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None;
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None;
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None;
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None;
     }
 }

From 341ce27f1ce66c17916b22ef40ec94f433d77129 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:40:59 +0300
Subject: [PATCH 27/63] cleanup, removing unused features

---
 .../System/Text/RegularExpressions/Symbolic/StateFlags.cs  | 5 -----
 .../Symbolic/SymbolicRegexMatcher.Automata.cs              | 6 ------
 .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs    | 7 ++-----
 3 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index c1628ebbcf312..a342aff09b6b8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -20,7 +20,6 @@ internal enum StateFlags : byte
         IsNullableFlag = 4,
         CanBeNullableFlag = 8,
         SimulatesBacktrackingFlag = 16,
-        IsAcceleratedFlag = 32,
     }
 
     /// <summary>
@@ -30,11 +29,7 @@ internal static class StateFlagsExtensions
     {
         internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None;
         internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None;
-
         internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None;
-
         internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None;
-
-        internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 0925738c9a41e..65bd8834b6508 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -54,10 +54,6 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// </summary>
         private byte[] _nullabilityArray;
 
-        /// <summary>
-        /// Used to short-circuit accelerated states in the hot loop
-        /// </summary>
-        private bool[] _canBeAcceleratedArray;
 
         /// <summary>
         /// The transition function for DFA mode.
@@ -301,12 +297,10 @@ private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node
                     ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog);
                     ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize);
                     ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize);
-                    ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize);
                 }
                 _stateArray[state.Id] = state;
                 _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
                 _nullabilityArray[state.Id] = state.BuildNullabilityInfo();
-                _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated();
             }
 
             return state;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 0e285be987fec..3efb0d7db75ef 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -191,7 +191,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
             _stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
             _stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
             _nullabilityArray = new byte[InitialDfaStateCapacity];
-            _canBeAcceleratedArray = new bool[InitialDfaStateCapacity];
             _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
 
             // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm
@@ -361,7 +360,6 @@ internal TSet GetMintermFromId(int mintermId)
             return minterms[mintermId];
         }
 
-        /// <summary>TODO: this if-else branch could be called once. it's currently causing overhead on every single step</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
             where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ?
@@ -871,7 +869,6 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
 
 
         /// <summary>
-        /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups
         /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
@@ -901,6 +898,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
             int initialStatePos = initialStatePosRef;
             int initialStatePosCandidate = initialStatePosCandidateRef;
             int deadStateId = _deadStateId;
+            int initialStateId = _initialStateId;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
@@ -910,7 +908,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     {
                         return true;
                     }
-                    if ((_stateFlagsArray[state.DfaStateId] & StateFlags.IsAcceleratedFlag) != 0)
+                    if (state.DfaStateId == initialStateId)
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
                         {
@@ -1866,7 +1864,6 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 byte[] lookup,
                 ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
-
             {
                 if (currentStateId != initialStateId)
                     return false;

From 1a28c69f925d72f9fe8837ef58ae47fe41b2e13b Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 27 Jun 2024 19:36:02 +0300
Subject: [PATCH 28/63] cleanup

---
 .../Symbolic/SymbolicRegexMatcher.cs          | 95 +------------------
 1 file changed, 2 insertions(+), 93 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 3efb0d7db75ef..61d3bd99cccec 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -564,11 +564,13 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                             currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
                             ref initialStatePosCandidate);
                 else
+                {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     done =
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
                             FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
                             ref initialStatePosCandidate, ref initialStatePosCandidate);
+                }
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
@@ -668,99 +670,6 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             return endPos;
         }
 
-        /// <summary>
-        /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
-        /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
-        /// lazily building out the graph as needed.
-        /// </summary>
-        /// <remarks>
-        /// The <typeparamref name="TStateHandler"/> supplies the actual transitioning logic, controlling whether processing is
-        /// performed in DFA mode or in NFA mode.  However, it expects <paramref name="state"/> to be configured to match,
-        /// so for example if <typeparamref name="TStateHandler"/> is a <see cref="DfaStateHandler"/>, it expects the <paramref name="state"/>'s
-        /// <see cref="CurrentState.DfaStateId"/> to be non-negative and its <see cref="CurrentState.NfaState"/> to be null; vice versa for
-        /// <see cref="NfaStateHandler"/>.
-        /// </remarks>
-        /// <returns>
-        /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
-        /// 0 if iteration completed because we reached an initial state.
-        /// A negative value if iteration completed because we ran out of input or we failed to transition.
-        /// </returns>
-        private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
-            where TStateHandler : struct, IStateHandler
-            where TInputReader : struct, IInputReader
-            where TFindOptimizationsHandler : struct, IInitialStateHandler
-            where TNullabilityHandler : struct, INullabilityHandler
-        {
-            // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
-            int pos = posRef;
-            int endPos = endPosRef;
-            // int endStateId = endStateIdRef;
-            int initialStatePos = initialStatePosRef;
-            int initialStatePosCandidate = initialStatePosCandidateRef;
-            try
-            {
-                // Loop through each character in the input, transitioning from state to state for each.
-                while (true)
-                {
-                    StateFlags flags = TStateHandler.GetStateFlags(this, in state);
-
-                    // Check if currentState represents an initial state. If it does, call into any possible find optimizations
-                    // to hopefully more quickly find the next possible starting location.
-                    if (flags.IsInitial())
-                    {
-                        if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
-                        {
-                            return true;
-                        }
-
-                        initialStatePosCandidate = pos;
-                    }
-
-                    // If the state is a dead end, such that we can't transition anywhere else, end the search.
-                    if (state.DfaStateId == _deadStateId)
-                    {
-                        return true;
-                    }
-
-                    int positionId = TInputReader.GetPositionId(this, input, pos);
-
-                    // If the state is nullable for the next character, meaning it accepts the empty string,
-                    // we found a potential end state.
-                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, flags))
-                    {
-                        endPos = pos;
-                        // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
-                        initialStatePos = initialStatePosCandidate;
-
-                        // A match is known to exist.  If that's all we need to know, we're done.
-                        if (mode == RegexRunnerMode.ExistenceRequired)
-                        {
-                            return true;
-                        }
-                    }
-
-                    // If there is more input available try to transition with the next character.
-                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
-                    {
-                        return false;
-                    }
-
-                    // We successfully transitioned, so update our current input index to match.
-                    pos++;
-                }
-            }
-            finally
-            {
-                // Write back the local copies of the ref values.
-                posRef = pos;
-                endPosRef = endPos;
-                // endStateIdRef = endStateId;
-                initialStatePosRef = initialStatePos;
-                initialStatePosCandidateRef = initialStatePosCandidate;
-            }
-        }
-
 
         /// <summary>
         /// tbd

From 9bba84fc2b8c0bcc8e1efe6ff2ae415833a94563 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 01:04:05 +0300
Subject: [PATCH 29/63] timeout limit changes

---
 .../Symbolic/SymbolicRegexMatcher.cs                   | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 61d3bd99cccec..0643d2fe9750f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -549,20 +549,22 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
 
             while (true)
             {
-                const int CharsPerTimeoutCheck = 1_000;
-                // TODO: maybe this should be for NFA mode only
+                // TODO: this could be safely raised higher but 25k is about the limit where it contributes overhead
+                const int CharsPerTimeoutCheck = 25000;
                 int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
                     pos + CharsPerTimeoutCheck :
                     input.Length;
 
                 bool done;
                 if (currentState.NfaState is null)
+                {
                     done =
                         FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
                             TAcceleratedStateHandler,
                             TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
                             currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
                             ref initialStatePosCandidate);
+                }
                 else
                 {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
@@ -629,7 +631,9 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 // catastrophic backtracking.  Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
                 // still check the timeout now and again to provide some semblance of the behavior a developer experiences with
                 // the backtracking engines.  We can, however, choose a large number here, since it's not actually needed for security.
-                const int CharsPerTimeoutCheck = 1_000;
+                // todo: the reason why this is lower than FindEndPositionOptimized is an arbitrary choice, but 255+ minterms and NFA mode may
+                // reach speeds low enough for this to be relevant
+                const int CharsPerTimeoutCheck = 5_000;
                 int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
                     pos + CharsPerTimeoutCheck :
                     input.Length;

From a9577815b8b0a50f46de7bfd01aea8f9110cab5e Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 15:52:40 +0300
Subject: [PATCH 30/63] lookup allocation threshold and timeout limits

---
 .../Symbolic/MintermClassifier.cs             | 55 +++++++++-------
 .../Symbolic/SymbolicRegexMatcher.cs          | 66 +++++++++++--------
 .../FunctionalTests/Regex.Match.Tests.cs      |  2 +-
 3 files changed, 72 insertions(+), 51 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index fc8a1abdd55d9..88f3de35b23f9 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -20,18 +20,20 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
-        private static readonly byte[] s_emptyLookup = new byte[ushort.MaxValue + 1];
         /// <summary>An array used to map characters to minterms</summary>
         private readonly byte[]? _lookup;
 
-        /// <summary>Conserve memory if pattern is ascii-only</summary>
-        private readonly bool _isAsciiOnly;
-
         /// <summary>
         /// Fallback lookup if over 255 minterms. This is rarely used.
         /// </summary>
         private readonly int[]? _intLookup;
 
+
+        /// <summary>
+        /// Maximum ordinal character for a non-0 minterm, used to conserve memory
+        /// </summary>
+        private readonly int _maxChar;
+
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
         public MintermClassifier(BDD[] minterms)
@@ -42,18 +44,24 @@ public MintermClassifier(BDD[] minterms)
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
-                _lookup = s_emptyLookup;
+                _lookup = Array.Empty<byte>();
                 return;
             }
 
-            // ascii-only array to save memory
-            _isAsciiOnly = true;
+            // attempt to save memory in common cases by allocating only up to the highest char code
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
-                if (BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2 >= 128)
-                {
-                    _isAsciiOnly = false;
-                }
+                _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
+            }
+            // increment by 1 to fit the highest character code in the 0-based array as well
+            _maxChar += 1;
+
+            // the trade-off is somewhere around 5% performance for a higher initial allocation.
+            // past a certain threshold where the maxChar is already large,
+            // the full 64k can be allocated and OptimizedFullInputReader can be used
+            if (_maxChar > 32_000)
+            {
+                _maxChar = ushort.MaxValue + 1;
             }
 
             // It's incredibly rare for a regex to use more than a hundred or two minterms,
@@ -61,7 +69,7 @@ public MintermClassifier(BDD[] minterms)
             if (minterms.Length > 255)
             {
                 // over 255 unique sets also means it's never ascii only
-                int[] lookup = new int[ushort.MaxValue + 1];
+                int[] lookup = new int[_maxChar];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
                     // precompute all assigned minterm categories
@@ -77,7 +85,7 @@ public MintermClassifier(BDD[] minterms)
             }
             else
             {
-                byte[] lookup = new byte[_isAsciiOnly ? 128 : ushort.MaxValue + 1];
+                byte[] lookup = new byte[_maxChar];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
                     // precompute all assigned minterm categories
@@ -97,23 +105,14 @@ public MintermClassifier(BDD[] minterms)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
-            if (_isAsciiOnly && (c >= 128))
+            if (c > _maxChar)
             {
                 return 0;
             }
 
-            // high performance variant would use a span directly.
-            // additional memory is saved by using a byte
+            // high performance inner-loop variant uses the array directly
             return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
-
-        /// <summary>
-        /// Whether to use the low memory ascii-only hot loop or the full loop
-        /// </summary>
-        /// <returns></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public bool IsAsciiOnly() => _isAsciiOnly;
-
         /// <summary>
         /// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
         /// Null if there are greater than 255 minterms.
@@ -127,5 +126,13 @@ public int GetMintermID(int c)
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int[]? IntLookup() => _intLookup;
+
+        /// <summary>
+        /// Whether the full 64K char lookup is allocated.
+        /// This accelerates the minterm mapping by removing an if-else case,
+        /// and is only considered for the common &lt;= 255 minterms case
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 0643d2fe9750f..18e2c256e0854 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -404,31 +404,31 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases
             if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null))
             {
-                matchEnd = (_mintermClassifier.IsAsciiOnly(), _findOpts is not null, _containsAnyAnchor) switch
+                matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) switch
                 {
-                    (true, true, true) =>
-                        FindEndPositionOptimized<OptimizedAsciiInputReader, AcceleratedStateHandler,
+                    (false, true, true) =>
+                        FindEndPositionOptimized<OptimizedSmallInputReader, AcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, true, false) =>
-                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAnchorAcceleratedStateHandler,
+                    (false, true, false) =>
+                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAnchorAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, false, false) =>
-                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAcceleratedStateHandler,
+                    (false, false, false) =>
+                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, false, true) =>
-                        FindEndPositionOptimized<OptimizedAsciiInputReader, NoAcceleratedStateHandler,
+                    (false, false, true) =>
+                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, true, false) =>
-                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAnchorAcceleratedStateHandler,
+                    (true, true, false) =>
+                        FindEndPositionOptimized<OptimizedFullInputReader, NoAnchorAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, true, true) =>
-                        FindEndPositionOptimized<OptimizedUnicodeInputReader, AcceleratedStateHandler,
+                    (true, true, true) =>
+                        FindEndPositionOptimized<OptimizedFullInputReader, AcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, false) =>
-                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
+                    (true, false, false) =>
+                        FindEndPositionOptimized<OptimizedFullInputReader, NoAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, true) =>
-                        FindEndPositionOptimized<OptimizedUnicodeInputReader, NoAcceleratedStateHandler,
+                    (true, false, true) =>
+                        FindEndPositionOptimized<OptimizedFullInputReader, NoAcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                 };
             }
@@ -704,6 +704,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             int currStateId = startStateId;
             int deadStateId = _deadStateId;
             int initialStateId = _initialStateId;
+            int maxChar = mtlookup.Length;
             try
             {
                 // The goal is to make this loop as fast as it can possibly be,
@@ -747,7 +748,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(
-                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, input, pos))
+                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos))
                         || pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
@@ -1633,26 +1634,39 @@ public static void UndoTransition(ref CurrentState state)
 #endif
         }
 
+        /// <summary>
+        /// This input reader attempts to minimize overhead
+        /// by handling constraints outside of the loop:
+        /// 1. the position must be already valid for the input.
+        /// 2. the pattern must not to contain \Z.
+        /// 3. to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm
+        /// </summary>
         private interface IOptimizedInputReader
         {
-            public static abstract int GetPositionId(byte[] lookup, ReadOnlySpan<char> input,
+            public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input,
                 int pos);
         }
 
-        private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader
+        /// <summary>
+        /// This reader maps all characters > maxChar to 0
+        /// </summary>
+        private readonly struct OptimizedSmallInputReader : IOptimizedInputReader
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
+            public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
             {
                 Debug.Assert(pos < input.Length);
-                return input[pos] >= 128 ? 0 : lookup[input[pos]];
+                return input[pos] > maxChar ? 0 : lookup[input[pos]];
             }
         }
 
-        private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader
+        /// <summary>
+        /// This reader is effectively an array lookup for the full 64k utf16 code unit mapping
+        /// </summary>
+        private readonly struct OptimizedFullInputReader : IOptimizedInputReader
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static int GetPositionId(byte[] lookup, ReadOnlySpan<char> input, int pos)
+            public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
             {
                 Debug.Assert(pos < input.Length);
                 Debug.Assert(lookup.Length == (ushort.MaxValue + 1));
@@ -1686,7 +1700,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}");
-                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos));
+                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos));
             }
         }
 
@@ -1784,7 +1798,7 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     currentStateId = matcher._dotstarredInitialStates[
-                        matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1]
+                        matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1]
                     ].Id;
                     return true;
                 }
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 57780531253d3..94ef063f1c079 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck()
         {
             // This constant must be at least as large as the one in the implementation that sets the maximum number
             // of innermost loop iterations between timeout checks.
-            const int CharsToTriggerTimeoutCheck = 10000;
+            const int CharsToTriggerTimeoutCheck = 25000;
             // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger.
             Assert.Throws<RegexMatchTimeoutException>(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1))
                 .Match(new string('a', CharsToTriggerTimeoutCheck)));

From 7e86855a92b880a5b30b6be35c1b7007c130ba53 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 18:16:49 +0300
Subject: [PATCH 31/63] char mapping

---
 .../Symbolic/MintermClassifier.cs                | 16 ++++++++++------
 .../Symbolic/SymbolicRegexMatcher.cs             |  5 +++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 88f3de35b23f9..029ee00f9d8a0 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -31,6 +31,7 @@ internal sealed class MintermClassifier
 
         /// <summary>
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
+        /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1
         /// </summary>
         private readonly int _maxChar;
 
@@ -53,15 +54,12 @@ public MintermClassifier(BDD[] minterms)
             {
                 _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
             }
-            // increment by 1 to fit the highest character code in the 0-based array as well
-            _maxChar += 1;
-
             // the trade-off is somewhere around 5% performance for a higher initial allocation.
             // past a certain threshold where the maxChar is already large,
             // the full 64k can be allocated and OptimizedFullInputReader can be used
             if (_maxChar > 32_000)
             {
-                _maxChar = ushort.MaxValue + 1;
+                _maxChar = ushort.MaxValue;
             }
 
             // It's incredibly rare for a regex to use more than a hundred or two minterms,
@@ -69,7 +67,7 @@ public MintermClassifier(BDD[] minterms)
             if (minterms.Length > 255)
             {
                 // over 255 unique sets also means it's never ascii only
-                int[] lookup = new int[_maxChar];
+                int[] lookup = new int[_maxChar + 1];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
                     // precompute all assigned minterm categories
@@ -85,7 +83,7 @@ public MintermClassifier(BDD[] minterms)
             }
             else
             {
-                byte[] lookup = new byte[_maxChar];
+                byte[] lookup = new byte[_maxChar + 1];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
                     // precompute all assigned minterm categories
@@ -134,5 +132,11 @@ public int GetMintermID(int c)
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1;
+
+        /// <summary>
+        /// Maximum ordinal character for a non-0 minterm, used to conserve memory
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int MaxChar() => _maxChar;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 18e2c256e0854..34ddab8f63533 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -704,7 +704,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             int currStateId = startStateId;
             int deadStateId = _deadStateId;
             int initialStateId = _initialStateId;
-            int maxChar = mtlookup.Length;
+            int maxChar = _mintermClassifier.MaxChar();
             try
             {
                 // The goal is to make this loop as fast as it can possibly be,
@@ -1655,7 +1655,8 @@ public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpa
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
             {
-                Debug.Assert(pos < input.Length);
+                Debug.Assert(pos < input.Length, "pos < input.Length");
+                Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}");
                 return input[pos] > maxChar ? 0 : lookup[input[pos]];
             }
         }

From 99b5717e61f0f7ee2e22597ae5ac69defe689dd4 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 20:09:36 +0300
Subject: [PATCH 32/63] empty array mapping

---
 .../System/Text/RegularExpressions/Symbolic/MintermClassifier.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 029ee00f9d8a0..12aaf3ce02c60 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -46,6 +46,7 @@ public MintermClassifier(BDD[] minterms)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
                 _lookup = Array.Empty<byte>();
+                _maxChar = -1;
                 return;
             }
 

From 47c6b0477cfb55cfd9ddf1b2a4ae81bd742fdba9 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 23:27:51 +0300
Subject: [PATCH 33/63] adding timeout check to create-derivative

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 17 ++--
 .../Symbolic/SymbolicRegexMatcher.cs          | 94 ++++++++++++-------
 .../Symbolic/SymbolicRegexThresholds.cs       | 15 ++-
 3 files changed, 77 insertions(+), 49 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 65bd8834b6508..fd16805c7d455 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -230,12 +230,6 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
 
             while (canLoop)
             {
-#if DEBUG
-                // if (current._left is null)
-                //     _wout($"NULL {current._kind}");
-                // else
-                //     _wout($"{pos} {current._kind} l:{current._left!._kind} {current}");
-#endif
                 (bool loop, SymbolicRegexNode<TSet> next) = current switch
                 {
                     // This could potentially be a very good future optimization for
@@ -386,16 +380,23 @@ private int GetCoreStateId(int nfaStateId)
         /// <summary>Gets or creates a new DFA transition.</summary>
         /// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
         private bool TryCreateNewTransition(
-            MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState<TSet>? nextState)
+            MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState<TSet>? nextState,
+            long timeoutOccursAt = 0)
         {
             Debug.Assert(offset < _dfaDelta.Length);
-
             lock (this)
             {
                 // check if meanwhile delta[offset] has become defined possibly by another thread
                 MatchingState<TSet>? targetState = _stateArray[_dfaDelta[offset]];
                 if (targetState is null)
                 {
+                    // check if there is an active timer
+                    if (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt)
+                    {
+                        nextState = null;
+                        return false;
+                    }
+
                     if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
                     {
                         nextState = null;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 34ddab8f63533..500221782b363 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -538,7 +538,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             }
         }
 
-        private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
+        private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
+            ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
             where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -549,28 +550,32 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
 
             while (true)
             {
-                // TODO: this could be safely raised higher but 25k is about the limit where it contributes overhead
-                const int CharsPerTimeoutCheck = 25000;
-                int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
-                    pos + CharsPerTimeoutCheck :
-                    input.Length;
-
+                int innerLoopLength;
                 bool done;
                 if (currentState.NfaState is null)
                 {
+                    const int dfaCharsPerTimeoutCheck = 100000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck
+                        ? pos + dfaCharsPerTimeoutCheck
+                        : input.Length;
                     done =
                         FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
                             TAcceleratedStateHandler,
                             TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
                             currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
-                            ref initialStatePosCandidate);
+                            ref initialStatePosCandidate, timeoutOccursAt);
                 }
                 else
                 {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
+                    // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
+                    const int nfaCharsPerTimeoutCheck = 1000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
+                        ? pos + nfaCharsPerTimeoutCheck
+                        : input.Length;
                     done =
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
-                            FullNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
+                            FullNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
                             ref initialStatePosCandidate, ref initialStatePosCandidate);
                 }
 
@@ -631,16 +636,34 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 // catastrophic backtracking.  Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
                 // still check the timeout now and again to provide some semblance of the behavior a developer experiences with
                 // the backtracking engines.  We can, however, choose a large number here, since it's not actually needed for security.
-                // todo: the reason why this is lower than FindEndPositionOptimized is an arbitrary choice, but 255+ minterms and NFA mode may
                 // reach speeds low enough for this to be relevant
-                const int CharsPerTimeoutCheck = 5_000;
-                int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
-                    pos + CharsPerTimeoutCheck :
-                    input.Length;
-
-                bool done = currentState.NfaState is not null ?
-                    FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate) :
-                    FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
+                // The fallback function has lower limits due to possibly worse performance
+                int innerLoopLength;
+                bool done;
+                if (currentState.NfaState is null)
+                {
+                    const int dfaCharsPerTimeoutCheck = 25000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck
+                        ? pos + dfaCharsPerTimeoutCheck
+                        : input.Length;
+                    done =
+                        FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
+                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
+                            ref endStateId, ref initialStatePosCandidate, timeoutOccursAt);
+                }
+                else
+                {
+                    // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
+                    // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
+                    const int nfaCharsPerTimeoutCheck = 1000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
+                        ? pos + nfaCharsPerTimeoutCheck
+                        : input.Length;
+                    done =
+                        FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
+                            TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
+                            ref endStateId, ref initialStatePosCandidate);
+                }
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
                 // there is no more input available, then the whole search is done.
@@ -680,7 +703,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// </summary>
         private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
             TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, long timeoutOccursAt)
             where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -748,7 +771,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(
-                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos))
+                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos),
+                    timeoutOccursAt)
                         || pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
@@ -800,7 +824,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
         private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef,
+                long timeoutOccursAt)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -851,7 +876,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If there is more input available try to transition with the next character.
                     if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state,
-                            positionId))
+                            positionId, timeoutOccursAt))
                     {
                         return false;
                     }
@@ -872,7 +897,6 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
         }
 
         /// <summary>
-        /// TODO: this is the fallback NFA function
         /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
@@ -889,7 +913,8 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
         /// 0 if iteration completed because we reached an initial state.
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
-        private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
+        private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(
+                ReadOnlySpan<char> input, int length, RegexRunnerMode mode, long timeoutOccursAt,
                 ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
@@ -931,7 +956,7 @@ private bool FindEndPositionDeltasNFA<TStateHandler, TInputReader, TFindOptimiza
                     }
 
                     // If there is more input available try to transition with the next character.
-                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt))
                     {
                         return false;
                     }
@@ -1007,7 +1032,8 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(CurrentState st
         /// starting at <paramref name="i"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,
         /// lazily building out the graph as needed.
         /// </summary>
-        private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
+        private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilityHandler>(
+            ReadOnlySpan<char> input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TNullabilityHandler : struct, INullabilityHandler
@@ -1037,7 +1063,7 @@ private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilit
                     }
 
                     // Try to transition with the next character, the one before the current position.
-                    if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0))
                     {
                         // Return false to indicate the search didn't finish.
                         return false;
@@ -1085,7 +1111,7 @@ private bool FindStartPositionDeltasNFA<TStateHandler, TInputReader, TNullabilit
                     }
 
                     // Try to transition with the next character, the one before the current position.
-                    if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
+                    if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0))
                     {
                         // Return false to indicate the search didn't finish.
                         return false;
@@ -1356,7 +1382,7 @@ private interface IStateHandler
             public static abstract bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind);
             public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, ReadOnlySpan<char> input, int pos);
             public static abstract int FixedLength(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, uint nextCharKind);
-            public static abstract bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId);
+            public static abstract bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId, long timeoutOccursAt);
             public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state);
         }
 
@@ -1380,7 +1406,8 @@ public static bool IsNullableFor(SymbolicRegexMatcher<TSet> matcher, in CurrentS
 
             /// <summary>Take the transition to the next DFA state.</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId)
+            public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId,
+                long timeoutOccursAt)
             {
                 Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}.");
                 Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}.");
@@ -1412,7 +1439,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
             /// <summary>Take the transition to the next DFA state without paying for the NFA structure</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
-                int mintermId)
+                int mintermId, long timeoutOccursAt)
             {
                 Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}.");
                 // Use the mintermId for the character being read to look up which state to transition to.
@@ -1429,7 +1456,7 @@ public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref
 
                 if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId,
                         matcher.DeltaOffset(state, mintermId),
-                        checkThreshold: true, out MatchingState<TSet>? nextState))
+                        checkThreshold: true, out MatchingState<TSet>? nextState, timeoutOccursAt))
                 {
                     // We were able to create a new DFA transition to some state. Move to it and
                     // return that we're still operating as a DFA and can keep going.
@@ -1516,7 +1543,8 @@ public static int FixedLength(SymbolicRegexMatcher<TSet> matcher, in CurrentStat
             }
 
             /// <summary>Take the transition to the next NFA state.</summary>
-            public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId)
+            public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref CurrentState state, int mintermId,
+                long timeoutOccursAt = 0)
             {
                 Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
                 Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index d455f26da1dcf..f26009f035a57 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -22,25 +22,24 @@ internal static class SymbolicRegexThresholds
         /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex.
         /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing
         /// to create a new node and the graph is already or newly beyond this threshold.
-        /// TODO: summarize this
-        /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
-        /// with an entirely different search-time algorithmic complexity
-        /// 100_000 isn't a really a high memory cost either,
-        /// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has
+        /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB;
+        /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially
+        /// significant search-time performance gains
         /// </remarks>
-        internal const int NfaThreshold = 100_000;
+        internal const int NfaThreshold = 25_000;
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
         /// after the AST has been anlayzed for safe handling.
-        /// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too
         /// <remarks>
         /// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
         /// This default value may be overridden with the AppContext data
         /// whose name is given by  <see cref="SymbolicRegexSafeSizeThreshold_ConfigKeyName"/>.
         /// </remarks>
+        /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s,
+        /// although it could be safely raised higher at the expense of worst-case NFA performance
         /// </summary>
-        internal const int DefaultSymbolicRegexSafeSizeThreshold = 1000;
+        internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; // nfa speed constraint
 
         ///<summary>The environment variable name for a value overriding the default value <see cref="DefaultSymbolicRegexSafeSizeThreshold"/></summary>
         internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE";

From 22d23fad468dd00779fbccd90838787cfa5d7d71 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sun, 30 Jun 2024 23:57:17 +0300
Subject: [PATCH 34/63] some cleanup

---
 .../RegularExpressions/Symbolic/MintermClassifier.cs |  6 +++---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs        | 12 ++++--------
 .../Symbolic/SymbolicRegexThresholds.cs              |  2 +-
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 12aaf3ce02c60..323cc6cdcc316 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -55,9 +55,9 @@ public MintermClassifier(BDD[] minterms)
             {
                 _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
             }
-            // the trade-off is somewhere around 5% performance for a higher initial allocation.
-            // past a certain threshold where the maxChar is already large,
-            // the full 64k can be allocated and OptimizedFullInputReader can be used
+            // there is an opportunity to gain around 5% performance for allocating the
+            // full 64K, past a certain threshold where maxChar is already large.
+            // TODO: what should this threshold be?
             if (_maxChar > 32_000)
             {
                 _maxChar = ushort.MaxValue;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index fd16805c7d455..e5430249a86a8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -390,14 +390,10 @@ private bool TryCreateNewTransition(
                 MatchingState<TSet>? targetState = _stateArray[_dfaDelta[offset]];
                 if (targetState is null)
                 {
-                    // check if there is an active timer
-                    if (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt)
-                    {
-                        nextState = null;
-                        return false;
-                    }
-
-                    if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
+                    if (// check if there is an active timer
+                        (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) ||
+                        // check if size exceeds the NFA threshold
+                        (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold))
                     {
                         nextState = null;
                         return false;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index f26009f035a57..5bcda9cfba731 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -39,7 +39,7 @@ internal static class SymbolicRegexThresholds
         /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s,
         /// although it could be safely raised higher at the expense of worst-case NFA performance
         /// </summary>
-        internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; // nfa speed constraint
+        internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000;
 
         ///<summary>The environment variable name for a value overriding the default value <see cref="DefaultSymbolicRegexSafeSizeThreshold"/></summary>
         internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE";

From 761f897bff33c3f0b94ecd800d6d9fa6b7e41b28 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 1 Jul 2024 01:48:29 +0300
Subject: [PATCH 35/63] comments and cleanup

---
 .../Symbolic/MatchingState.cs                 |  11 --
 .../Symbolic/SymbolicRegexInfo.cs             |   1 +
 .../Symbolic/SymbolicRegexMatcher.cs          | 108 ++++++++++++------
 3 files changed, 73 insertions(+), 47 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 55032b39d9bb1..405be0318bbd5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -100,7 +100,6 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
 
         /// <summary>
         /// Cached nullability check with encoded bits
-        /// whereever possible
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
@@ -108,16 +107,6 @@ internal bool IsNullableFor(uint nextCharKind)
             return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
         }
 
-        /// <summary>
-        /// Full nullability check for initialization
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal bool IsNullableForInit(uint nextCharKind)
-        {
-            Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
-            return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind));
-        }
-
         /// <summary>
         /// Builds a <see cref="StateFlags"/> with the relevant flags set.
         /// </summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
index 750fbed4774bf..b0aa0cd6e938d 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
@@ -55,6 +55,7 @@ private static SymbolicRegexInfo Create(
         public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
 
         public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
+
         public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
 
         public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 500221782b363..037796198ed70 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -211,31 +211,30 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
             if (findOptimizations.IsUseful &&
                 findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
             {
-                // this makes some assumptions about the frequency of occurrences
-                // some large sets like \p{Sm} are faster with infrequent matches but slower with frequent matches
-                // the easiest thing to do here is to leave it as-is, but this means some inputs can have large performance losses of 10x or more
-
-                var setIsTooCommon = new Func<RegexFindOptimizations.FixedDistanceSet, bool>((fds) =>
-                {
-                    return fds switch
-                    {
-                        { Chars: not null } =>
-                            // anything above 4 uint16 chars is generally slower than DFA
-                            fds.Negated ||
-                            (fds.Chars.Length > 4 &&
-                            Array.Exists(fds.Chars, char.IsAsciiLetterLower)),
-                        { Range: not null } => false,
-                        // for fixed length strings just trust the optimizations
-                        _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength,
-                    };
-                });
-
                 // In some cases where the findOptimizations are useful, just using the DFA can still be faster.
                 _findOpts = findOptimizations switch
                 {
-                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when  findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke) => null,
-                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when setIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
-                    _ => findOptimizations
+                    // for sets in fixed length patterns just trust the optimizations,
+                    // the performance can be either better or worse depending on frequency
+                    {
+                        FindMode:
+                        FindNextStartingPositionMode.FixedDistanceSets_LeftToRight or
+                        FindNextStartingPositionMode.LeadingSet_LeftToRight} when
+                        _optimizedReversalState.Kind != MatchReversalKind.FixedLength => findOptimizations,
+                    // string literals are the best case
+                    {
+                        FindMode:
+                        FindNextStartingPositionMode.LeadingString_LeftToRight or
+                        FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
+                        FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight
+                    } => findOptimizations,
+                    // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null
+                    { FindMode: FindNextStartingPositionMode.LeadingStrings_LeftToRight } => findOptimizations,
+                    { FindMode: FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations,
+                    // for singular character sets it depends if there's any reasonably small set to be accelerated
+                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => null,
+                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
+                    _ => null
                 };
             }
 
@@ -291,6 +290,36 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
             _reverseInitialStates = reverseInitialStates;
 
 
+            // TODO: this is still work in progress
+            // The frequency of occurrences makes a big difference here,
+            // anything above 4 uint16 chars is generally slower than DFA, but
+            // if the characters are very rare, then SearchValues can be up to ~2x faster
+            // SearchValues<char> implementations to avoid:
+            // - ProbabilisticCharSearchValues
+            // - ProbabilisticWithAsciiCharSearchValues`1
+            // - AsciiCharSearchValues`1
+            // - Any5SearchValues`2"
+            // SearchValues<string> implementations to avoid:
+            // - StringSearchValuesAhoCorasick`2
+            bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet)
+            {
+                return fixedDistanceSet switch
+                {
+                    // anything above 4 uint16 chars is generally slower than DFA
+                    { Chars: not null } =>
+                        // negated sets are usually large
+                        fixedDistanceSet.Negated ||
+                        (fixedDistanceSet.Chars.Length > 4
+                        // TODO: this extra condition is currently kept so there's no regressions
+                        // if ~500mb/s worst case is acceptable then this could be removed
+                        // but being able to guess which character sets are not too frequent can
+                        // often reach over 1gb/s with AVX
+                        && Array.Exists(fixedDistanceSet.Chars, char.IsAsciiLetterLower)),
+                    { Range: not null } => false,
+                    _ => false,
+                };
+            }
+
             // Maps a minterm ID to a character kind
             uint CalculateMintermIdKind(int mintermId)
             {
@@ -561,9 +590,9 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                     done =
                         FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
                             TAcceleratedStateHandler,
-                            TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, ref pos,
+                            TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, timeoutOccursAt, ref pos,
                             currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
-                            ref initialStatePosCandidate, timeoutOccursAt);
+                            ref initialStatePosCandidate);
                 }
                 else
                 {
@@ -648,8 +677,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                         : input.Length;
                     done =
                         FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos,
-                            ref endStateId, ref initialStatePosCandidate, timeoutOccursAt);
+                            TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
+                            ref endStateId, ref initialStatePosCandidate);
                 }
                 else
                 {
@@ -699,11 +728,14 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
 
 
         /// <summary>
-        /// tbd
+        /// This version of <see cref="FindEndPositionDeltasDFA"/> uses a different set of interfaces,
+        /// which don't check for many inner loop edge cases e.g. input end or '\n'.
+        /// All edge cases are handled before entering the loop.
         /// </summary>
         private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
             TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-                ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, long timeoutOccursAt)
+            long timeoutOccursAt, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef,
+            ref int initialStatePosCandidateRef)
             where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -742,8 +774,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
                     this, mtlookup, input, ref currStateId, ref pos, initialStateId))
                     {
-                        // future work could combine this with an immediate state transition
-                        // but this requires changing too much for now
+                        // a good potential future optimization here would
+                        // be to combine this with an immediate state transition
                         if (pos == input.Length)
                         {
                             // patterns such as ^$ can be nullable right away
@@ -769,7 +801,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     }
 
                     // If there is more input available try to transition with the next character.
-                    // Note: the order here is important so the transition gets taken
+                    // Note: the order here is important so the transition itself gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(
                     this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos),
                     timeoutOccursAt)
@@ -824,8 +856,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
         private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef,
-                long timeoutOccursAt)
+                long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef
+                )
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -1666,7 +1698,7 @@ public static void UndoTransition(ref CurrentState state)
         /// This input reader attempts to minimize overhead
         /// by handling constraints outside of the loop:
         /// 1. the position must be already valid for the input.
-        /// 2. the pattern must not to contain \Z.
+        /// 2. the pattern must not contain \Z.
         /// 3. to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm
         /// </summary>
         private interface IOptimizedInputReader
@@ -1690,7 +1722,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
         }
 
         /// <summary>
-        /// This reader is effectively an array lookup for the full 64k utf16 code unit mapping
+        /// This reader is effectively an array lookup for the all utf16 code units
         /// </summary>
         private readonly struct OptimizedFullInputReader : IOptimizedInputReader
         {
@@ -1703,6 +1735,10 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
             }
         }
 
+        /// <summary>
+        /// This nullability handler interface can be used in DFAs
+        /// for patterns that do not contain \Z
+        /// </summary>
         private interface IOptimizedNullabilityHandler
         {
             public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
@@ -1728,7 +1764,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
                 byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
-                Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}");
+                Debug.Assert(pos < input.Length, $"input end should not be handled here");
                 return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos));
             }
         }

From 53924eb9ac148b8ad236d855c4eafb0dd4eb4fee Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 1 Jul 2024 02:38:26 +0300
Subject: [PATCH 36/63] cleanup and comments

---
 .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs  | 9 ++-------
 .../Symbolic/SymbolicRegexRunnerFactory.cs               | 1 -
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 037796198ed70..fef5b3ece17d3 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -665,8 +665,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 // catastrophic backtracking.  Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
                 // still check the timeout now and again to provide some semblance of the behavior a developer experiences with
                 // the backtracking engines.  We can, however, choose a large number here, since it's not actually needed for security.
-                // reach speeds low enough for this to be relevant
-                // The fallback function has lower limits due to possibly worse performance
+                // The fallback function has lower limits due to worse performance from edge cases
                 int innerLoopLength;
                 bool done;
                 if (currentState.NfaState is null)
@@ -719,10 +718,6 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     CheckTimeout(timeoutOccursAt);
                 }
             }
-
-            // Check whether there's a fixed-length marker for the current state.  If there is, we can
-            // use that length to optimize subsequent matching phases.
-            // matchLength = endStateId > 0 ? GetState(endStateId).FixedLength(GetCharKind<TInputReader>(input, endPos)) : -1;
             return endPos;
         }
 
@@ -1468,7 +1463,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
                 return false;
             }
 
-            /// <summary>Take the transition to the next DFA state without paying for the NFA structure</summary>
+            /// <summary>Transition function that only considers DFA state id</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
                 int mintermId, long timeoutOccursAt)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
index c046531f8a295..aa6708a60d01a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
@@ -21,7 +21,6 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
             var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping);
 
             SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root);
-            // rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode);
 
             // Determine if the root node is supported for safe handling
             int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold();

From e66d3d37575371fab6777366e4d8d2bb6c9b0f31 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 1 Jul 2024 14:51:58 +0300
Subject: [PATCH 37/63] reflecting new limits in tests

---
 .../Symbolic/SymbolicRegexMatcher.cs          | 21 ++++++++++++-------
 .../Symbolic/SymbolicRegexThresholds.cs       |  2 +-
 .../FunctionalTests/Regex.Match.Tests.cs      |  2 +-
 .../tests/UnitTests/SymbolicRegexTests.cs     |  6 +++---
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index fef5b3ece17d3..c95c1802c2a44 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -84,7 +84,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
         private readonly int _deadStateId;
 
-        /// <summary>Initial state used to for vectorization</summary>
+        /// <summary>Initial state used for vectorization</summary>
         private readonly int _initialStateId;
 
         /// <summary>Whether the pattern contains any anchor</summary>
@@ -785,7 +785,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
-                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup, input, pos))
+                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup,
+                            maxChar, input, pos))
                     {
                         endPos = pos;
                         // A match is known to exist.  If that's all we need to know, we're done.
@@ -1738,16 +1739,20 @@ private interface IOptimizedNullabilityHandler
         {
             public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
                 byte[] nullabilityArray, int
-                    currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+                    currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader;
         }
 
         private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup,
+                int maxChar, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
+                Debug.Assert(pos < input.Length, "input end should not be handled here");
+                Debug.Assert(currStateId < nullabilityArray.Length,
+                    "nullabilityArray grown but the reference is not up to date");
                 return nullabilityArray[currStateId] > 0;
             }
         }
@@ -1756,11 +1761,13 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
-                Debug.Assert(pos < input.Length, $"input end should not be handled here");
-                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos));
+                Debug.Assert(pos < input.Length, "input end should not be handled here");
+                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
+                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup,
+                    maxChar, input, pos));
             }
         }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index 5bcda9cfba731..b8c559135e5e4 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -30,7 +30,7 @@ internal static class SymbolicRegexThresholds
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
-        /// after the AST has been anlayzed for safe handling.
+        /// after the AST has been analyzed for safe handling.
         /// <remarks>
         /// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
         /// This default value may be overridden with the AppContext data
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 94ef063f1c079..bb3f7495f03fe 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck()
         {
             // This constant must be at least as large as the one in the implementation that sets the maximum number
             // of innermost loop iterations between timeout checks.
-            const int CharsToTriggerTimeoutCheck = 25000;
+            const int CharsToTriggerTimeoutCheck = 100000;
             // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger.
             Assert.Throws<RegexMatchTimeoutException>(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1))
                 .Match(new string('a', CharsToTriggerTimeoutCheck)));
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
index cbddba878edc2..7192b70cec451 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
@@ -209,13 +209,13 @@ public static IEnumerable<object[]> UnsafeThresholdTests_MemberData()
             [
                 // simple counters that are too large
                 "((ab){0,9000})",
-                "((ab){1000})",
+                "((ab){5000})",
                 "((ab){100,5000})",
                 // almost infinite lower bound
                 "a{2147483646,}",              // 2147483646 = int.MaxValue-1
                 // nested small counters causing unsafe blowup through multiplicative nature of counter nesting
-                "(((ab){10}){10}){10}",        // more than 10^3
-                "((((abcd){4}){4}){4}){4}",    // exponential: more than 4^5 = 1024
+                "(((ab){10}){10}){50}",        // more than 10^3 * 5
+                "(((((abcd){4}){4}){4}){4}){10}",    // exponential: more than 4^5 * 10 = 10240
                 // combined large counters
                 "((ab){1000}){1000}",          // more than 1000^2
                 "((ab){99999999}){99999999}",  // multiply: much more than int.MaxValue

From 65c0b8bce3fa583ebf9b47cd3561aab4fe4fc0a2 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:53:25 +0300
Subject: [PATCH 38/63] rerunning tests

---
 .../Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index b8c559135e5e4..9509da2a751d8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -24,7 +24,7 @@ internal static class SymbolicRegexThresholds
         /// to create a new node and the graph is already or newly beyond this threshold.
         /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB;
         /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially
-        /// significant search-time performance gains
+        /// significant search-time performance gains.
         /// </remarks>
         internal const int NfaThreshold = 25_000;
 

From de085b46a4ef486add83555696e64f3a43fde858 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 1 Jul 2024 23:05:26 +0300
Subject: [PATCH 39/63] retesting DFA timeout

---
 .../tests/FunctionalTests/Regex.Match.Tests.cs                  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index bb3f7495f03fe..7f946adfd2d27 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck()
         {
             // This constant must be at least as large as the one in the implementation that sets the maximum number
             // of innermost loop iterations between timeout checks.
-            const int CharsToTriggerTimeoutCheck = 100000;
+            const int CharsToTriggerTimeoutCheck = 200000;
             // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger.
             Assert.Throws<RegexMatchTimeoutException>(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1))
                 .Match(new string('a', CharsToTriggerTimeoutCheck)));

From 5ef3b320ed5cf76b53cf48ae8c231370a95a98bf Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 2 Jul 2024 04:01:38 +0300
Subject: [PATCH 40/63] more precise regex memory limit for DFA mode

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  5 ++--
 .../Symbolic/SymbolicRegexMatcher.cs          | 25 ++++++++++---------
 .../Symbolic/SymbolicRegexThresholds.cs       | 17 ++++++-------
 .../tests/FunctionalTests/Regex.Ctor.Tests.cs |  2 +-
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index e5430249a86a8..58807073d944d 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -392,8 +392,9 @@ private bool TryCreateNewTransition(
                 {
                     if (// check if there is an active timer
                         (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) ||
-                        // check if size exceeds the NFA threshold
-                        (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold))
+                        // check if amount of nodes exceeds the NFA threshold
+                        (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)
+                    )
                     {
                         nextState = null;
                         return false;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index c95c1802c2a44..de2631aca0149 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -5,6 +5,7 @@
 using System.Diagnostics;
 using System.IO;
 using System.Numerics;
+using System.Reflection;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
@@ -220,7 +221,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
                         FindMode:
                         FindNextStartingPositionMode.FixedDistanceSets_LeftToRight or
                         FindNextStartingPositionMode.LeadingSet_LeftToRight} when
-                        _optimizedReversalState.Kind != MatchReversalKind.FixedLength => findOptimizations,
+                        _optimizedReversalState.Kind == MatchReversalKind.FixedLength => findOptimizations,
                     // string literals are the best case
                     {
                         FindMode:
@@ -573,9 +574,11 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
+            // TODO: possible this value could be removed
             int initialStatePosCandidate = pos;
             var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<FullInputReader>(input, pos - 1)]);
             int endPos = NoMatchExists;
+            int lengthMinus1 = input.Length - 1;
 
             while (true)
             {
@@ -584,15 +587,14 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 if (currentState.NfaState is null)
                 {
                     const int dfaCharsPerTimeoutCheck = 100000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck
+                    innerLoopLength = _checkTimeout && lengthMinus1 - pos > dfaCharsPerTimeoutCheck
                         ? pos + dfaCharsPerTimeoutCheck
-                        : input.Length;
+                        : lengthMinus1;
                     done =
                         FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
                             TAcceleratedStateHandler,
-                            TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, timeoutOccursAt, ref pos,
-                            currentState.DfaStateId, ref endPos, ref initialStatePosCandidate,
-                            ref initialStatePosCandidate);
+                            TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos,
+                            ref currentState.DfaStateId, ref endPos);
                 }
                 else
                 {
@@ -729,8 +731,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// </summary>
         private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
             TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
-            long timeoutOccursAt, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef,
-            ref int initialStatePosCandidateRef)
+            long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef)
             where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -739,7 +740,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             if (posRef == input.Length)
 
             {
-                if (_stateArray[startStateId]!.IsNullableFor(_positionKinds[0]))
+                if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0]))
                 {
                     // the end position kind was nullable
                     endPosRef = posRef;
@@ -751,7 +752,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             int pos = posRef;
             int endPos = endPosRef;
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
-            int currStateId = startStateId;
+            int currStateId = currentStateIdRef;
             int deadStateId = _deadStateId;
             int initialStateId = _initialStateId;
             int maxChar = _mintermClassifier.MaxChar();
@@ -803,7 +804,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     timeoutOccursAt)
                         || pos >= lengthMinus1)
                     {
-                        if (pos + 1 < input.Length)
+                        if (pos < lengthMinus1)
                         {
                             return false;
                         }
@@ -829,7 +830,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                initialStatePosRef = currStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef;
+                currentStateIdRef = currStateId;
             }
         }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index 9509da2a751d8..5dcaa31225941 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -8,25 +8,22 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </summary>
     internal static class SymbolicRegexThresholds
     {
-        /// <summary>Maximum number of built states before switching over to NFA mode.</summary>
+        /// <summary>Maximum number of <see cref="SymbolicRegexNode{TSet}"/> instances before switching over to NFA mode.</summary>
         /// <remarks>
         /// By default, all matching starts out using DFAs, where every state transitions to one and only one
         /// state for any minterm (each character maps to one minterm).  Some regular expressions, however, can result
         /// in really, really large DFA state graphs, much too big to actually store.  Instead of failing when we
         /// encounter such state graphs, at some point we instead switch from processing as a DFA to processing as
-        /// an NFA.  As an NFA, we instead track all of the states we're in at any given point, and transitioning
-        /// from one "state" to the next really means for every constituent state that composes our current "state",
-        /// we find all possible states that transitioning out of each of them could result in, and the union of
-        /// all of those is our new "state".  This constant represents the size of the graph after which we start
-        /// processing as an NFA instead of as a DFA.  This processing doesn't change immediately, however. All
-        /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex.
-        /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing
-        /// to create a new node and the graph is already or newly beyond this threshold.
+        /// an NFA. As an NFA, we instead track all of the states we're in at any given point.
+        /// </remarks>
+        /// <remarks>
         /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB;
         /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially
         /// significant search-time performance gains.
+        /// Worst case memory consumption for the regex instance can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))
+        /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state.
         /// </remarks>
-        internal const int NfaThreshold = 25_000;
+        internal const int NfaNodeCountThreshold = 125_000;
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
index cefad99252342..b9659996a4e51 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
@@ -133,7 +133,7 @@ public static void Ctor_Invalid()
                 Assert.Throws<NotSupportedException>(() => new Regex(@"(?>a*)a", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and atomics
                 Assert.Throws<NotSupportedException>(() => new Regex(@"\Ga", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and start anchors
                 Assert.Throws<NotSupportedException>(() => new Regex(@"(?<C>A)(?<-C>B)$", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and balancing groups
-                Assert.Throws<NotSupportedException>(() => new Regex(@"\w{1,1001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion
+                Assert.Throws<NotSupportedException>(() => new Regex(@"\w{1,100001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion
             }
         }
 

From 281446fd3775eb5dda439769c770fcb617c0e3ff Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:59:04 +0300
Subject: [PATCH 41/63] reverting change

---
 .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index de2631aca0149..2f06c8e4af85c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -804,7 +804,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     timeoutOccursAt)
                         || pos >= lengthMinus1)
                     {
-                        if (pos < lengthMinus1)
+                        if (pos + 1 < input.Length)
                         {
                             return false;
                         }

From 8f78046589334e7c97851998f4ceecd58b789a46 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 3 Jul 2024 03:52:56 +0300
Subject: [PATCH 42/63] reverting reversal refactor

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs |  9 +-
 .../Symbolic/SymbolicRegexMatcher.cs          | 86 ++++++++++---------
 .../Symbolic/SymbolicRegexThresholds.cs       |  7 +-
 3 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 58807073d944d..3f096b97db3b5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -182,7 +182,7 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
         /// 2) the reversal starts at abc.*|
         /// </summary>
         /// <param name="node">reversed initial pattern</param>
-        /// <returns>returns n of chars to skip and adjusted reversal start state</returns>
+        /// <returns>returns num of chars to skip and adjusted reversal start state</returns>
         private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
         {
             int pos = 0;
@@ -194,12 +194,14 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                 pos = 0;
                 return (false, node);
             });
+
             var addSingleton = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
                 pos += 1;
                 // continue with next concat
                 return (true, concatNode._right!);
             });
+
             var addFixedLengthLoop = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
                 SymbolicRegexNode<TSet>? loopNode = concatNode._left;
@@ -207,6 +209,7 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                 {
                     return (false, concatNode);
                 }
+
                 switch (loopNode!._left!.Kind)
                 {
                     case SymbolicRegexNodeKind.Singleton:
@@ -217,9 +220,11 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                             // the entire loop is fixed, continue
                             return (true, concatNode._right!);
                         }
+
                         // subtract the fixed part of the loop
                         int loopRemainder = loopNode._upper - loopNode._lower;
-                        SymbolicRegexNode<TSet> newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
+                        SymbolicRegexNode<TSet> newLeft =
+                            _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
                         SymbolicRegexNode<TSet> newNode = _builder.CreateConcat(newLeft, concatNode._right!);
                         pos += loopNode._lower;
                         return (true, newNode);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 2f06c8e4af85c..89f2ae1fe8336 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1,11 +1,11 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
 using System.Numerics;
-using System.Reflection;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
@@ -82,7 +82,10 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
         private readonly RegexFindOptimizations? _findOpts;
 
-        /// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
+        /// <summary>
+        /// Dead end state to quickly return NoMatch.
+        /// This could potentially be a constant if it's the very first state created
+        /// </summary>
         private readonly int _deadStateId;
 
         /// <summary>Initial state used for vectorization</summary>
@@ -91,7 +94,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Whether the pattern contains any anchor</summary>
         private readonly bool _containsAnyAnchor;
 
-        /// <summary>Whether the pattern contains the EndZ anchor which makes most optimizations invalid</summary>
+        /// <summary>Whether the pattern contains the EndZ anchor, which makes most optimization shortcuts invalid</summary>
         private readonly bool _containsEndZAnchor;
 
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
@@ -163,11 +166,11 @@ public static SymbolicRegexMatcher<TSet> Create(
 
             // Convert the BDD-based AST to TSet-based AST
             SymbolicRegexNode<TSet> rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver));
-            return new SymbolicRegexMatcher<TSet>(bddBuilder, builder, rootNode, captureCount, findOptimizations, matchTimeout);
+            return new SymbolicRegexMatcher<TSet>(builder, rootNode, captureCount, findOptimizations, matchTimeout);
         }
 
         /// <summary>Constructs matcher for given symbolic regex.</summary>
-        private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
+        private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout)
         {
             Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}");
 
@@ -215,13 +218,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<BDD> _, SymbolicRegexBuilder<T
                 // In some cases where the findOptimizations are useful, just using the DFA can still be faster.
                 _findOpts = findOptimizations switch
                 {
-                    // for sets in fixed length patterns just trust the optimizations,
-                    // the performance can be either better or worse depending on frequency
-                    {
-                        FindMode:
-                        FindNextStartingPositionMode.FixedDistanceSets_LeftToRight or
-                        FindNextStartingPositionMode.LeadingSet_LeftToRight} when
-                        _optimizedReversalState.Kind == MatchReversalKind.FixedLength => findOptimizations,
                     // string literals are the best case
                     {
                         FindMode:
@@ -230,11 +226,15 @@ FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
                         FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight
                     } => findOptimizations,
                     // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null
-                    { FindMode: FindNextStartingPositionMode.LeadingStrings_LeftToRight } => findOptimizations,
-                    { FindMode: FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations,
+                    {
+                        FindMode:
+                        FindNextStartingPositionMode.LeadingStrings_LeftToRight or
+                        FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight,
+                        LeadingStrings: not null
+                    } when findOptimizations.LeadingStrings.GetType().Name != "StringSearchValuesAhoCorasick`2" => findOptimizations,
                     // for singular character sets it depends if there's any reasonably small set to be accelerated
-                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => null,
-                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null,
+                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when !findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => findOptimizations,
+                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when !CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => findOptimizations,
                     _ => null
                 };
             }
@@ -291,31 +291,30 @@ FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
             _reverseInitialStates = reverseInitialStates;
 
 
-            // TODO: this is still work in progress
-            // The frequency of occurrences makes a big difference here,
-            // anything above 4 uint16 chars is generally slower than DFA, but
-            // if the characters are very rare, then SearchValues can be up to ~2x faster
-            // SearchValues<char> implementations to avoid:
-            // - ProbabilisticCharSearchValues
-            // - ProbabilisticWithAsciiCharSearchValues`1
-            // - AsciiCharSearchValues`1
-            // - Any5SearchValues`2"
-            // SearchValues<string> implementations to avoid:
-            // - StringSearchValuesAhoCorasick`2
+            // Some SearchValues<char> implementations are slower than a DFA,
+            // but depend on input frequency.
+            // This is currently tuned for consistency
+            // but it could return false to enable findOptimizations.
             bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet)
             {
+                char[]? chars = fixedDistanceSet.Chars;
+                bool avoidSearchValues = false;
+                if (chars is not null && chars.Length > 5)
+                {
+                    // RegexFindOptimizations picks 3 sets at most so the construction overhead should not be too high
+                    var searchValues = SearchValues.Create(chars);
+                    avoidSearchValues = searchValues.GetType().Name switch
+                    {
+                        "ProbabilisticCharSearchValues" => true,
+                        "ProbabilisticWithAsciiCharSearchValues`1" => true,
+                        "AsciiCharSearchValues`1" => true,
+                        _ => false
+                    };
+                }
+
                 return fixedDistanceSet switch
                 {
-                    // anything above 4 uint16 chars is generally slower than DFA
-                    { Chars: not null } =>
-                        // negated sets are usually large
-                        fixedDistanceSet.Negated ||
-                        (fixedDistanceSet.Chars.Length > 4
-                        // TODO: this extra condition is currently kept so there's no regressions
-                        // if ~500mb/s worst case is acceptable then this could be removed
-                        // but being able to guess which character sets are not too frequent can
-                        // often reach over 1gb/s with AVX
-                        && Array.Exists(fixedDistanceSet.Chars, char.IsAsciiLetterLower)),
+                    { Chars: not null } => fixedDistanceSet.Negated || avoidSearchValues,
                     { Range: not null } => false,
                     _ => false,
                 };
@@ -429,9 +428,10 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // It returns NoMatchExists (-2) when there is no match.
             // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
-            // the c as the low boundary for the starting position.d
-            int matchEnd;
+            // the c as the low boundary for the starting position.
+
             // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases
+            int matchEnd;
             if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null))
             {
                 matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) switch
@@ -568,13 +568,19 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             }
         }
 
+        /// <summary>
+        /// This version of <see cref="FindEndPositionFallback"/> uses a different set of interfaces,
+        /// which don't check for many inner loop edge cases e.g. input end or '\n'.
+        /// All edge cases are handled before entering the loop.
+        /// </summary>
         private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
             ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
             where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
-            // TODO: possible this value could be removed
+            // this initial state candidate is not really used in the common DFA case
+            // and could potentially be removed in the future
             int initialStatePosCandidate = pos;
             var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<FullInputReader>(input, pos - 1)]);
             int endPos = NoMatchExists;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index 5dcaa31225941..bf7d5a6501699 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -17,10 +17,11 @@ internal static class SymbolicRegexThresholds
         /// an NFA. As an NFA, we instead track all of the states we're in at any given point.
         /// </remarks>
         /// <remarks>
-        /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB;
+        /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance
+        /// is currently approx. 50 MB.
         /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially
-        /// significant search-time performance gains.
-        /// Worst case memory consumption for the regex instance can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))
+        /// significant search-time performance gains. Worst case memory consumption for the regex instance
+        /// can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))
         /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state.
         /// </remarks>
         internal const int NfaNodeCountThreshold = 125_000;

From 715752084280b47cbdfe6cabfb5079a8d100210e Mon Sep 17 00:00:00 2001
From: ieviev <36763595+ieviev@users.noreply.github.com>
Date: Wed, 3 Jul 2024 23:49:15 +0300
Subject: [PATCH 43/63] Apply suggestions from code review

Co-authored-by: Dan Moseley <danmose@microsoft.com>
---
 .../RegularExpressions/Symbolic/MintermClassifier.cs  |  1 -
 .../Symbolic/SymbolicRegexMatcher.cs                  | 11 ++++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 323cc6cdcc316..9e3dad73e5bff 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -28,7 +28,6 @@ internal sealed class MintermClassifier
         /// </summary>
         private readonly int[]? _intLookup;
 
-
         /// <summary>
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
         /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 89f2ae1fe8336..3ca0ddfd41e06 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -246,7 +246,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or
             // The loops below and how character kinds are calculated assume that the "general" character kind is zero
             Debug.Assert(CharKind.General == 0);
 
-
             // Assign edge case info for quick lookup
             _containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
             _containsEndZAnchor = _pattern._info.ContainsEndZAnchor;
@@ -290,7 +289,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or
             }
             _reverseInitialStates = reverseInitialStates;
 
-
             // Some SearchValues<char> implementations are slower than a DFA,
             // but depend on input frequency.
             // This is currently tuned for consistency
@@ -592,7 +590,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 bool done;
                 if (currentState.NfaState is null)
                 {
-                    const int dfaCharsPerTimeoutCheck = 100000;
+                    const int DfaCharsPerTimeoutCheck = 100000;
                     innerLoopLength = _checkTimeout && lengthMinus1 - pos > dfaCharsPerTimeoutCheck
                         ? pos + dfaCharsPerTimeoutCheck
                         : lengthMinus1;
@@ -606,7 +604,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
-                    const int nfaCharsPerTimeoutCheck = 1000;
+                    const int NfaCharsPerTimeoutCheck = 1000;
                     innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
                         ? pos + nfaCharsPerTimeoutCheck
                         : input.Length;
@@ -678,7 +676,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 bool done;
                 if (currentState.NfaState is null)
                 {
-                    const int dfaCharsPerTimeoutCheck = 25000;
+                    const int DfaCharsPerTimeoutCheck = 25000;
                     innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck
                         ? pos + dfaCharsPerTimeoutCheck
                         : input.Length;
@@ -691,7 +689,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
-                    const int nfaCharsPerTimeoutCheck = 1000;
+                    const int NfaCharsPerTimeoutCheck = 1000;
                     innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
                         ? pos + nfaCharsPerTimeoutCheck
                         : input.Length;
@@ -840,7 +838,6 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
             }
         }
 
-
         /// <summary>
         /// Workhorse inner loop for <see cref="FindEndPositionFallback{TInputReader,TFindOptimizationsHandler,TNullabilityHandler}"/>.  Consumes the <paramref name="input"/> character by character,
         /// starting at <paramref name="posRef"/>, for each character transitioning from one state in the DFA or NFA graph to the next state,

From 931552d4de8dd787a8687fad71c89dc641b47984 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 3 Jul 2024 23:52:09 +0300
Subject: [PATCH 44/63] variable naming

---
 .../Symbolic/SymbolicRegexMatcher.cs             | 16 ++++++++--------
 .../tests/FunctionalTests/Regex.Match.Tests.cs   |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 3ca0ddfd41e06..8e67be2ce7cf7 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -591,8 +591,8 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 if (currentState.NfaState is null)
                 {
                     const int DfaCharsPerTimeoutCheck = 100000;
-                    innerLoopLength = _checkTimeout && lengthMinus1 - pos > dfaCharsPerTimeoutCheck
-                        ? pos + dfaCharsPerTimeoutCheck
+                    innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck
+                        ? pos + DfaCharsPerTimeoutCheck
                         : lengthMinus1;
                     done =
                         FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
@@ -605,8 +605,8 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
                     const int NfaCharsPerTimeoutCheck = 1000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
-                        ? pos + nfaCharsPerTimeoutCheck
+                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
+                        ? pos + NfaCharsPerTimeoutCheck
                         : input.Length;
                     done =
                         FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
@@ -677,8 +677,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 if (currentState.NfaState is null)
                 {
                     const int DfaCharsPerTimeoutCheck = 25000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck
-                        ? pos + dfaCharsPerTimeoutCheck
+                    innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck
+                        ? pos + DfaCharsPerTimeoutCheck
                         : input.Length;
                     done =
                         FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
@@ -690,8 +690,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
                     // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
                     const int NfaCharsPerTimeoutCheck = 1000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck
-                        ? pos + nfaCharsPerTimeoutCheck
+                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
+                        ? pos + NfaCharsPerTimeoutCheck
                         : input.Length;
                     done =
                         FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 7f946adfd2d27..af5adb063bfb7 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck()
         {
             // This constant must be at least as large as the one in the implementation that sets the maximum number
             // of innermost loop iterations between timeout checks.
-            const int CharsToTriggerTimeoutCheck = 200000;
+            const int CharsToTriggerTimeoutCheck = 200_000;
             // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger.
             Assert.Throws<RegexMatchTimeoutException>(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1))
                 .Match(new string('a', CharsToTriggerTimeoutCheck)));

From cc493f13679e21f3662b33ce5018d7e29e644408 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Wed, 3 Jul 2024 23:54:26 +0300
Subject: [PATCH 45/63] test for over 255 minterms

---
 .../FunctionalTests/Regex.Match.Tests.cs      | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index af5adb063bfb7..b96d5459f6e8b 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2653,5 +2653,28 @@ public static IEnumerable<object[]> MatchWordsInAnchoredRegexes_TestData()
                 yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } };
             }
         }
+
+        [Fact]
+        public async Task MatchNonBacktrackingOver255Minterms()
+        {
+            // This is a test for the rare over 255 unique minterms case in MintermClassifier
+            StringBuilder pattern = new();
+            StringBuilder input = new();
+            for (int i = 256; i <= 768; i++)
+            {
+                string str = new Rune(i).ToString();
+                pattern.Append(str);
+                // adding an optional char as well just so it's not a string literal
+                pattern.Append(str);
+                pattern.Append('?');
+                // input is the pattern itself
+                input.Append(str);
+            }
+            Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, pattern.ToString(), RegexOptions.None);
+            MatchCollection ms = r.Matches(input.ToString());
+            Assert.Equal(1, ms.Count);
+            Assert.Equal(0, ms[0].Index);
+            Assert.Equal(513, ms[0].Length);
+        }
     }
 }

From a0d239064b276123b4e3726e12eddf34dcf007d9 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 00:22:50 +0300
Subject: [PATCH 46/63] adding net directive around test

---
 .../tests/FunctionalTests/Regex.Match.Tests.cs                 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index b96d5459f6e8b..51b3d926e66fd 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2653,7 +2653,7 @@ public static IEnumerable<object[]> MatchWordsInAnchoredRegexes_TestData()
                 yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } };
             }
         }
-
+#if NET
         [Fact]
         public async Task MatchNonBacktrackingOver255Minterms()
         {
@@ -2676,5 +2676,6 @@ public async Task MatchNonBacktrackingOver255Minterms()
             Assert.Equal(0, ms[0].Index);
             Assert.Equal(513, ms[0].Length);
         }
+#endif
     }
 }

From 0691c5894326a2e25b757eb06640c8c9eece3736 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 00:34:17 +0300
Subject: [PATCH 47/63] all engines in minterms test

---
 .../tests/FunctionalTests/Regex.Match.Tests.cs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 51b3d926e66fd..993879441bb28 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2670,11 +2670,19 @@ public async Task MatchNonBacktrackingOver255Minterms()
                 // input is the pattern itself
                 input.Append(str);
             }
-            Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, pattern.ToString(), RegexOptions.None);
-            MatchCollection ms = r.Matches(input.ToString());
-            Assert.Equal(1, ms.Count);
-            Assert.Equal(0, ms[0].Index);
-            Assert.Equal(513, ms[0].Length);
+
+            // just so it's not allocated multiple times
+            string patternString = pattern.ToString();
+            string inputString = input.ToString();
+
+            foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+            {
+                Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
+                MatchCollection ms = r.Matches(inputString);
+                Assert.Equal(1, ms.Count);
+                Assert.Equal(0, ms[0].Index);
+                Assert.Equal(513, ms[0].Length);
+            }
         }
 #endif
     }

From 8ceb20767ee48052122db04dfce3c1ca1e645476 Mon Sep 17 00:00:00 2001
From: ieviev <36763595+ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 01:50:59 +0300
Subject: [PATCH 48/63] Apply suggestions from code review

Co-authored-by: Stephen Toub <stoub@microsoft.com>
---
 .../Symbolic/SymbolicRegexMatcher.cs          | 91 +++++++++----------
 1 file changed, 45 insertions(+), 46 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 8e67be2ce7cf7..4f1f13ef9fc60 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -510,12 +510,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                     {
                         i -= _optimizedReversalState.FixedLength;
                         reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!);
+
                         // reversal may already be nullable here in the case of anchors
                         if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0)
                         {
                             if (FullNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
-                                    in reversalStartState, FullInputReader.GetPositionId(this, input, i),
-                                    DfaStateHandler.GetStateFlags(this, in reversalStartState)))
+                                in reversalStartState, FullInputReader.GetPositionId(this, input, i),
+                                DfaStateHandler.GetStateFlags(this, in reversalStartState)))
                             {
                                 initialLastStart = i;
                             }
@@ -523,8 +524,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                     }
                     else
                     {
-                        reversalStartState = new CurrentState(_reverseInitialStates[
-                            GetCharKind<FullInputReader>(input, matchEnd)]);
+                        reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind<FullInputReader>(input, matchEnd)]);
+
                     }
                     matchStart = matchEnd < startat
                         ? startat
@@ -567,9 +568,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
         }
 
         /// <summary>
-        /// This version of <see cref="FindEndPositionFallback"/> uses a different set of interfaces,
-        /// which don't check for many inner loop edge cases e.g. input end or '\n'.
-        /// All edge cases are handled before entering the loop.
+        /// Streamlined version of <see cref="FindEndPositionFallback"/> that doesn't handle /z anchors or very large sets of minterms.
         /// </summary>
         private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
             ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
@@ -590,7 +589,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 bool done;
                 if (currentState.NfaState is null)
                 {
-                    const int DfaCharsPerTimeoutCheck = 100000;
+                    const int DfaCharsPerTimeoutCheck = 100_000;
                     innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck
                         ? pos + DfaCharsPerTimeoutCheck
                         : lengthMinus1;
@@ -603,7 +602,6 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                 else
                 {
                     // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
-                    // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
                     const int NfaCharsPerTimeoutCheck = 1000;
                     innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
                         ? pos + NfaCharsPerTimeoutCheck
@@ -772,10 +770,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     }
 
                     if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
-                    this, mtlookup, input, ref currStateId, ref pos, initialStateId))
+                        this, mtlookup, input, ref currStateId, ref pos, initialStateId))
                     {
-                        // a good potential future optimization here would
-                        // be to combine this with an immediate state transition
                         if (pos == input.Length)
                         {
                             // patterns such as ^$ can be nullable right away
@@ -784,16 +780,18 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                                 // the end position kind was nullable
                                 endPos = pos;
                             }
+
                             currStateId = deadStateId;
                             continue;
                         }
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
-                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup,
-                            maxChar, input, pos))
+                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(
+                        this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos))
                     {
                         endPos = pos;
+
                         // A match is known to exist.  If that's all we need to know, we're done.
                         if (mode == RegexRunnerMode.ExistenceRequired)
                         {
@@ -804,27 +802,28 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition itself gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(
-                    this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos),
-                    timeoutOccursAt)
-                        || pos >= lengthMinus1)
+                        this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) ||
+                        pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
                         {
                             return false;
                         }
                         pos++;
+
                         // one off check for the final position
                         // this is just to move it out of the hot loop
                         if (!(_stateFlagsArray[currStateId].IsNullable() ||
-                              _stateArray[currStateId]!.IsNullableFor(
-                                  GetPositionKind(-1))))
+                              _stateArray[currStateId]!.IsNullableFor(GetPositionKind(-1))))
                         {
                             return true;
+
                         }
                         // the end position (-1) was nullable
                         endPos = pos;
                         return true;
                     }
+
                     // We successfully transitioned, so update our current input index to match.
                     pos++;
                 }
@@ -856,8 +855,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// A negative value if iteration completed because we ran out of input or we failed to transition.
         /// </returns>
         private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
-                long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef
-                )
+            long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
             where TStateHandler : struct, IStateHandler
             where TInputReader : struct, IInputReader
             where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -879,6 +877,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     {
                         return true;
                     }
+
                     if (state.DfaStateId == initialStateId)
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
@@ -893,9 +892,10 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
                     if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state,
-                            positionId, TStateHandler.GetStateFlags(this, in state)))
+                        positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
+
                         // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
                         initialStatePos = initialStatePosCandidate;
 
@@ -907,8 +907,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                     }
 
                     // If there is more input available try to transition with the next character.
-                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state,
-                            positionId, timeoutOccursAt))
+                    if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt))
                     {
                         return false;
                     }
@@ -922,7 +921,6 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                 // Write back the local copies of the ref values.
                 posRef = pos;
                 endPosRef = endPos;
-                // endStateIdRef = endStateId;
                 initialStatePosRef = initialStatePos;
                 initialStatePosCandidateRef = initialStatePosCandidate;
             }
@@ -1078,10 +1076,11 @@ private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilit
                 while (true)
                 {
                     int positionId = TInputReader.GetPositionId(this, input, pos - 1);
+
                     // If the state accepts the empty string, we found a valid starting position.  Record it and keep going,
                     // since we're looking for the earliest one to occur within bounds.
-                    if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
-                            TStateHandler.GetStateFlags(this, in state)))
+                    if (_nullabilityArray[state.DfaStateId] > 0 &&
+                        TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         lastStart = pos;
                     }
@@ -1128,8 +1127,7 @@ private bool FindStartPositionDeltasNFA<TStateHandler, TInputReader, TNullabilit
 
                     // If the state accepts the empty string, we found a valid starting position.  Record it and keep going,
                     // since we're looking for the earliest one to occur within bounds.
-                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
-                            TStateHandler.GetStateFlags(this, in state)))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         lastStart = pos;
                     }
@@ -1473,7 +1471,8 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
             public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
                 int mintermId, long timeoutOccursAt)
             {
-                Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}.");
+                Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0");
+
                 // Use the mintermId for the character being read to look up which state to transition to.
                 // If that state has already been materialized, move to it, and we're done. If that state
                 // hasn't been materialized, try to create it; if we can, move to it, and we're done.
@@ -1487,14 +1486,15 @@ public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref
                 }
 
                 if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId,
-                        matcher.DeltaOffset(state, mintermId),
-                        checkThreshold: true, out MatchingState<TSet>? nextState, timeoutOccursAt))
+                    matcher.DeltaOffset(state, mintermId),
+                    checkThreshold: true, out MatchingState<TSet>? nextState, timeoutOccursAt))
                 {
                     // We were able to create a new DFA transition to some state. Move to it and
                     // return that we're still operating as a DFA and can keep going.
                     state = nextState.Id;
                     return true;
                 }
+
                 return false;
             }
 
@@ -1703,8 +1703,7 @@ public static void UndoTransition(ref CurrentState state)
         /// </summary>
         private interface IOptimizedInputReader
         {
-            public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input,
-                int pos);
+            public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos);
         }
 
         /// <summary>
@@ -1717,7 +1716,8 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
             {
                 Debug.Assert(pos < input.Length, "pos < input.Length");
                 Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}");
-                return input[pos] > maxChar ? 0 : lookup[input[pos]];
+                char c = input[pos];
+                return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
             }
         }
 
@@ -1742,8 +1742,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
         private interface IOptimizedNullabilityHandler
         {
             public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                byte[] nullabilityArray, int
-                    currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
+                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
                 where TOptimizedInputReader : struct, IOptimizedInputReader;
         }
 
@@ -1755,8 +1754,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
                 where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 Debug.Assert(pos < input.Length, "input end should not be handled here");
-                Debug.Assert(currStateId < nullabilityArray.Length,
-                    "nullabilityArray grown but the reference is not up to date");
+                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
                 return nullabilityArray[currStateId] > 0;
             }
         }
@@ -1770,8 +1768,9 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
             {
                 Debug.Assert(pos < input.Length, "input end should not be handled here");
                 Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
-                return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup,
-                    maxChar, input, pos));
+                return
+                    nullabilityArray[currStateId] > 0 &&
+                    matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, maxChar, input, pos));
             }
         }
 
@@ -1827,8 +1826,8 @@ public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRe
         private interface IAcceleratedStateHandler
         {
             public static abstract bool TryFindNextStartingPosition<TOptimizedInputReader>(
-                SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input, ref
-                int currentStateId, ref int pos, int initialStateId)
+                SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input,
+                ref int currentStateId, ref int pos, int initialStateId)
                 where TOptimizedInputReader : struct, IOptimizedInputReader;
         }
 
@@ -1836,13 +1835,13 @@ public static abstract bool TryFindNextStartingPosition<TOptimizedInputReader>(
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                byte[] lookup,
-                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+                byte[] lookup, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
                 where TOptimizedInputReader : struct, IOptimizedInputReader
-
             {
                 if (currentStateId != initialStateId)
+                {
                     return false;
+                }
 
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {

From 379519b032a327c2bf39fc601abf00de365062a5 Mon Sep 17 00:00:00 2001
From: ieviev <36763595+ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 02:02:07 +0300
Subject: [PATCH 49/63] Apply suggestions from code review

Co-authored-by: Stephen Toub <stoub@microsoft.com>
---
 .../Symbolic/MintermClassifier.cs             |  2 +-
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 43 +++++++++----------
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 9e3dad73e5bff..b61a2f2cc96fb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -137,6 +137,6 @@ public int GetMintermID(int c)
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public int MaxChar() => _maxChar;
+        public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 3f096b97db3b5..c97b5e15b6403 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -188,6 +188,7 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
             int pos = 0;
             SymbolicRegexNode<TSet>? current = node;
             bool canLoop = true;
+
             // finding anchors inside pattern invalidates this optimization
             var bail = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
             {
@@ -216,18 +217,18 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
 
                         if (loopNode._lower == loopNode._upper)
                         {
-                            pos += loopNode._lower;
                             // the entire loop is fixed, continue
+                            pos += loopNode._lower;
                             return (true, concatNode._right!);
                         }
 
                         // subtract the fixed part of the loop
                         int loopRemainder = loopNode._upper - loopNode._lower;
-                        SymbolicRegexNode<TSet> newLeft =
-                            _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
+                        SymbolicRegexNode<TSet> newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
                         SymbolicRegexNode<TSet> newNode = _builder.CreateConcat(newLeft, concatNode._right!);
                         pos += loopNode._lower;
                         return (true, newNode);
+
                     default:
                         return (false, concatNode);
                 }
@@ -237,36 +238,32 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
             {
                 (bool loop, SymbolicRegexNode<TSet> next) = current switch
                 {
-                    // This could potentially be a very good future optimization for
+                    // Bail if it contains any anchors. (This could potentially be a very good future optimization for
                     // anchors but there's too many edge cases to guarantee it works.
-                    // one example which fails currently: pattern: @"\By\b", input: "xy"
+                    // one example which fails currently: pattern: @"\By\b", input: "xy")
                     { _info.ContainsSomeAnchor: true } => bail(current),
+
                     // if this is reached then entire match is fixed length
                     { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
-                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} =>
-                        (true, current._right!),
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } =>
-                        (true, current._right!),
-                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} =>
-                        addSingleton(current),
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
-                        addFixedLengthLoop(current),
+
+                    { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!),
+
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!),
+
+                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => addSingleton(current),
+
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current),
+
                     _ => (false, current)
                 };
                 canLoop = loop;
                 current = next;
             }
 
-            MatchReversal<TSet> reversal =
-                (pos, current) switch
-                {
-                    { pos: > 0 } when current == _builder.Epsilon => new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos),
-                    { pos: > 0 } => new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos,
-                        GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)),
-                    _ => new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0)
-                };
-
-            return reversal;
+                return
+                    pos <= 0 ? new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0) :
+                    current == _builder.Epsilon ? new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos) :
+                    new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
         }
 
         /// <summary>

From 57c8f6d41385de279752413743d3824da79a155b Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 02:37:18 +0300
Subject: [PATCH 50/63] simplifying code

---
 .../Symbolic/MintermClassifier.cs             |  15 --
 .../Symbolic/SymbolicRegexMatcher.cs          | 154 ++++--------------
 .../tests/UnitTests/SymbolicRegexTests.cs     |  14 --
 3 files changed, 31 insertions(+), 152 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index b61a2f2cc96fb..41a6c9b007593 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -54,13 +54,6 @@ public MintermClassifier(BDD[] minterms)
             {
                 _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
             }
-            // there is an opportunity to gain around 5% performance for allocating the
-            // full 64K, past a certain threshold where maxChar is already large.
-            // TODO: what should this threshold be?
-            if (_maxChar > 32_000)
-            {
-                _maxChar = ushort.MaxValue;
-            }
 
             // It's incredibly rare for a regex to use more than a hundred or two minterms,
             // but we need a fallback just in case.
@@ -125,14 +118,6 @@ public int GetMintermID(int c)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int[]? IntLookup() => _intLookup;
 
-        /// <summary>
-        /// Whether the full 64K char lookup is allocated.
-        /// This accelerates the minterm mapping by removing an if-else case,
-        /// and is only considered for the common &lt;= 255 minterms case
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1;
-
         /// <summary>
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
         /// </summary>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 4f1f13ef9fc60..d7582f2e27c2f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -110,7 +110,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         private readonly MatchingState<TSet>[] _reverseInitialStates;
 
         /// <summary>
-        /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
+        /// Reversal state which skips fixed length parts.
         /// </summary>
         private readonly MatchReversal<TSet> _optimizedReversalState;
 
@@ -215,28 +215,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             if (findOptimizations.IsUseful &&
                 findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning)
             {
-                // In some cases where the findOptimizations are useful, just using the DFA can still be faster.
-                _findOpts = findOptimizations switch
-                {
-                    // string literals are the best case
-                    {
-                        FindMode:
-                        FindNextStartingPositionMode.LeadingString_LeftToRight or
-                        FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
-                        FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight
-                    } => findOptimizations,
-                    // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null
-                    {
-                        FindMode:
-                        FindNextStartingPositionMode.LeadingStrings_LeftToRight or
-                        FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight,
-                        LeadingStrings: not null
-                    } when findOptimizations.LeadingStrings.GetType().Name != "StringSearchValuesAhoCorasick`2" => findOptimizations,
-                    // for singular character sets it depends if there's any reasonably small set to be accelerated
-                    { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when !findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => findOptimizations,
-                    { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when !CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => findOptimizations,
-                    _ => null
-                };
+                _findOpts = findOptimizations;
             }
 
             // Determine the number of initial states. If there's no anchor, only the default previous
@@ -289,35 +268,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or
             }
             _reverseInitialStates = reverseInitialStates;
 
-            // Some SearchValues<char> implementations are slower than a DFA,
-            // but depend on input frequency.
-            // This is currently tuned for consistency
-            // but it could return false to enable findOptimizations.
-            bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet)
-            {
-                char[]? chars = fixedDistanceSet.Chars;
-                bool avoidSearchValues = false;
-                if (chars is not null && chars.Length > 5)
-                {
-                    // RegexFindOptimizations picks 3 sets at most so the construction overhead should not be too high
-                    var searchValues = SearchValues.Create(chars);
-                    avoidSearchValues = searchValues.GetType().Name switch
-                    {
-                        "ProbabilisticCharSearchValues" => true,
-                        "ProbabilisticWithAsciiCharSearchValues`1" => true,
-                        "AsciiCharSearchValues`1" => true,
-                        _ => false
-                    };
-                }
-
-                return fixedDistanceSet switch
-                {
-                    { Chars: not null } => fixedDistanceSet.Negated || avoidSearchValues,
-                    { Range: not null } => false,
-                    _ => false,
-                };
-            }
-
             // Maps a minterm ID to a character kind
             uint CalculateMintermIdKind(int mintermId)
             {
@@ -432,32 +382,20 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             int matchEnd;
             if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null))
             {
-                matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) switch
+                matchEnd = (_findOpts is not null, _containsAnyAnchor) switch
                 {
-                    (false, true, true) =>
+                    (true, true) =>
                         FindEndPositionOptimized<OptimizedSmallInputReader, AcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, true, false) =>
+                    (true, false) =>
                         FindEndPositionOptimized<OptimizedSmallInputReader, NoAnchorAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, false) =>
+                    (false, false) =>
                         FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false, true) =>
+                    (false, true) =>
                         FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, true, false) =>
-                        FindEndPositionOptimized<OptimizedFullInputReader, NoAnchorAcceleratedStateHandler,
-                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, true, true) =>
-                        FindEndPositionOptimized<OptimizedFullInputReader, AcceleratedStateHandler,
-                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, false, false) =>
-                        FindEndPositionOptimized<OptimizedFullInputReader, NoAcceleratedStateHandler,
-                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, false, true) =>
-                        FindEndPositionOptimized<OptimizedFullInputReader, NoAcceleratedStateHandler,
-                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                 };
             }
             else
@@ -572,7 +510,6 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
         /// </summary>
         private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
             ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
-            where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
@@ -594,7 +531,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
                         ? pos + DfaCharsPerTimeoutCheck
                         : lengthMinus1;
                     done =
-                        FindEndPositionDeltasDFAOptimized<TOptimizedInputReader,
+                        FindEndPositionDeltasDFAOptimized<
                             TAcceleratedStateHandler,
                             TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos,
                             ref currentState.DfaStateId, ref endPos);
@@ -731,10 +668,9 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
         /// which don't check for many inner loop edge cases e.g. input end or '\n'.
         /// All edge cases are handled before entering the loop.
         /// </summary>
-        private bool FindEndPositionDeltasDFAOptimized<TOptimizedInputReader, TAcceleratedStateHandler,
+        private bool FindEndPositionDeltasDFAOptimized<TAcceleratedStateHandler,
             TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
             long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef)
-            where TOptimizedInputReader : struct, IOptimizedInputReader
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
@@ -769,7 +705,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                         return true;
                     }
 
-                    if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
+                    if (TAcceleratedStateHandler.TryFindNextStartingPosition(
                         this, mtlookup, input, ref currStateId, ref pos, initialStateId))
                     {
                         if (pos == input.Length)
@@ -787,7 +723,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
-                    if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(
+                    if (TOptimizedNullabilityHandler.IsNullable(
                         this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos))
                     {
                         endPos = pos;
@@ -802,7 +738,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition itself gets taken
                     if (!DfaStateHandler.TryTakeDFATransition(
-                        this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) ||
+                        this, ref currStateId, OptimizedSmallInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) ||
                         pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
@@ -1694,22 +1630,12 @@ public static void UndoTransition(ref CurrentState state)
 #endif
         }
 
-        /// <summary>
-        /// This input reader attempts to minimize overhead
-        /// by handling constraints outside of the loop:
-        /// 1. the position must be already valid for the input.
-        /// 2. the pattern must not contain \Z.
-        /// 3. to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm
-        /// </summary>
-        private interface IOptimizedInputReader
-        {
-            public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos);
-        }
 
-        /// <summary>
-        /// This reader maps all characters > maxChar to 0
-        /// </summary>
-        private readonly struct OptimizedSmallInputReader : IOptimizedInputReader
+
+        // /// <summary>
+        // /// This reader maps all characters > maxChar to 0
+        // /// </summary>
+        private readonly struct OptimizedSmallInputReader
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
@@ -1721,37 +1647,22 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
             }
         }
 
-        /// <summary>
-        /// This reader is effectively an array lookup for the all utf16 code units
-        /// </summary>
-        private readonly struct OptimizedFullInputReader : IOptimizedInputReader
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
-            {
-                Debug.Assert(pos < input.Length);
-                Debug.Assert(lookup.Length == (ushort.MaxValue + 1));
-                return lookup[input[pos]];
-            }
-        }
-
         /// <summary>
         /// This nullability handler interface can be used in DFAs
         /// for patterns that do not contain \Z
         /// </summary>
         private interface IOptimizedNullabilityHandler
         {
-            public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
-                where TOptimizedInputReader : struct, IOptimizedInputReader;
+            public static abstract bool IsNullable(SymbolicRegexMatcher<TSet> matcher,
+                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input,
+                int pos);
         }
 
         private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup,
+            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup,
                 int maxChar, ReadOnlySpan<char> input, int pos)
-                where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 Debug.Assert(pos < input.Length, "input end should not be handled here");
                 Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
@@ -1762,15 +1673,15 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
         private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher,
                 byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
-                where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 Debug.Assert(pos < input.Length, "input end should not be handled here");
                 Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
                 return
                     nullabilityArray[currStateId] > 0 &&
-                    matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, maxChar, input, pos));
+                    matcher.IsNullableWithContext(currStateId,
+                        input[pos] < (uint)lookup.Length ? lookup[input[pos]] : 0);
             }
         }
 
@@ -1825,18 +1736,16 @@ public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRe
         /// </summary>
         private interface IAcceleratedStateHandler
         {
-            public static abstract bool TryFindNextStartingPosition<TOptimizedInputReader>(
+            public static abstract bool TryFindNextStartingPosition(
                 SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input,
-                ref int currentStateId, ref int pos, int initialStateId)
-                where TOptimizedInputReader : struct, IOptimizedInputReader;
+                ref int currentStateId, ref int pos, int initialStateId);
         }
 
         private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+            public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
                 byte[] lookup, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
-                where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 if (currentStateId != initialStateId)
                 {
@@ -1857,10 +1766,9 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
         private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+            public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
                 byte[] lookup,
                 ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
-                where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 if (currentStateId != initialStateId)
                     return false;
@@ -1868,7 +1776,8 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     currentStateId = matcher._dotstarredInitialStates[
-                        matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1]
+                        matcher._positionKinds[
+                            OptimizedSmallInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1]
                     ].Id;
                     return true;
                 }
@@ -1883,10 +1792,9 @@ public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRe
         private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
+            public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
                 byte[] lookup,
                 ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
-                where TOptimizedInputReader : struct, IOptimizedInputReader
             {
                 return false;
             }
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
index 7192b70cec451..c14e5e366e53b 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs
@@ -253,19 +253,5 @@ public void SafeThresholdConfigTest(object? newThresholdData, int expectedThresh
             AppContext.SetData(SymbolicRegexThresholds.SymbolicRegexSafeSizeThreshold_ConfigKeyName, null);
             Assert.Equal(expectedThreshold, k);
         }
-
-        [Fact]
-        public static void OptimizedReversalTests()
-        {
-            var charSetSolver = new CharSetSolver();
-            var bddBuilder = new SymbolicRegexBuilder<BDD>(charSetSolver, charSetSolver);
-            var converter = new RegexNodeConverter(bddBuilder, null);
-            const RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture;
-            RegexNode tree = RegexParser.Parse("abc.*def", options, CultureInfo.CurrentCulture).Root;
-            SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(tree);
-            // todo: import the matcher here or use something else?
-            // var matcher = SymbolicRegexMatcher.Create(bddBuilder, rootNode, 0, null, TimeSpan.MaxValue);
-
-        }
     }
 }

From 2e57d428f0aa699805cb88290f0d44fb647c8da8 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 02:39:11 +0300
Subject: [PATCH 51/63] state flag values down

---
 .../System/Text/RegularExpressions/Symbolic/StateFlags.cs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index a342aff09b6b8..b446fecdca28f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -1,8 +1,6 @@
 ﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System.Runtime.CompilerServices;
-
 namespace System.Text.RegularExpressions.Symbolic
 {
     /// <summary>
@@ -17,9 +15,9 @@ internal enum StateFlags : byte
     {
         None = 0,
         IsInitialFlag = 1,
-        IsNullableFlag = 4,
-        CanBeNullableFlag = 8,
-        SimulatesBacktrackingFlag = 16,
+        IsNullableFlag = 2,
+        CanBeNullableFlag = 4,
+        SimulatesBacktrackingFlag = 8,
     }
 
     /// <summary>

From 60b1352f204f1726874b8f2558c2fe54272a0bf3 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 02:45:50 +0300
Subject: [PATCH 52/63] mintermclassifier changes

---
 .../Symbolic/MintermClassifier.cs             | 22 ++++++++-----------
 .../Symbolic/SymbolicRegexMatcher.cs          |  2 +-
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 41a6c9b007593..7a1af1fb5496b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -28,27 +28,20 @@ internal sealed class MintermClassifier
         /// </summary>
         private readonly int[]? _intLookup;
 
-        /// <summary>
-        /// Maximum ordinal character for a non-0 minterm, used to conserve memory
-        /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1
-        /// </summary>
-        private readonly int _maxChar;
-
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
         /// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
         public MintermClassifier(BDD[] minterms)
         {
             Debug.Assert(minterms.Length > 0, "Requires at least");
 
-
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
                 _lookup = Array.Empty<byte>();
-                _maxChar = -1;
                 return;
             }
 
+            int _maxChar = -1;
             // attempt to save memory in common cases by allocating only up to the highest char code
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
@@ -96,13 +89,16 @@ public MintermClassifier(BDD[] minterms)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
-            if (c > _maxChar)
+            if (_intLookup is null)
             {
-                return 0;
+                byte[] lookup = _lookup!;
+                return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
+            }
+            else
+            {
+                int[] lookup = _intLookup!;
+                return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
             }
-
-            // high performance inner-loop variant uses the array directly
-            return _intLookup is null ? _lookup![c] : _intLookup[c];
         }
         /// <summary>
         /// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index d7582f2e27c2f..3a7b331125002 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1404,7 +1404,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
 
             /// <summary>Transition function that only considers DFA state id</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
+            internal static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
                 int mintermId, long timeoutOccursAt)
             {
                 Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0");

From 2900aadc6478ddb792d4e0b397fc8cabd69c1877 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 03:49:48 +0300
Subject: [PATCH 53/63] reversal

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 84 ++++++++++---------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index c97b5e15b6403..524073c11959b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -186,24 +186,57 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
         private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
         {
             int pos = 0;
-            SymbolicRegexNode<TSet>? current = node;
+            SymbolicRegexNode<TSet> current = node;
             bool canLoop = true;
 
+            while (canLoop)
+            {
+                (bool loop, SymbolicRegexNode<TSet> next) = current switch
+                {
+                    // Bail if it contains any anchors. (This could potentially be a very good future optimization for
+                    // anchors but there's too many edge cases to guarantee it works.
+                    // one example which fails currently: pattern: @"\By\b", input: "xy")
+                    { _info.ContainsSomeAnchor: true } => Bail(),
+
+                    // if this is reached then entire match is fixed length
+                    { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
+
+                    { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!),
+
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!),
+
+                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => AddSingleton(current),
+
+                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
+                        AddFixedLengthLoop(current),
+
+                    _ => (false, current)
+                };
+                canLoop = loop;
+                current = next;
+            }
+
+            return
+                pos <= 0 ? new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0) :
+                current == _builder.Epsilon ? new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos) :
+                new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
+
             // finding anchors inside pattern invalidates this optimization
-            var bail = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
+            (bool, SymbolicRegexNode<TSet>) Bail()
             {
-                pos = 0;
+                pos += 1;
+                // continue with next concat
                 return (false, node);
-            });
+            }
 
-            var addSingleton = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
+            (bool, SymbolicRegexNode<TSet>) AddSingleton(SymbolicRegexNode<TSet> concatNode)
             {
                 pos += 1;
                 // continue with next concat
                 return (true, concatNode._right!);
-            });
+            }
 
-            var addFixedLengthLoop = new Func<SymbolicRegexNode<TSet>, (bool, SymbolicRegexNode<TSet>)>(concatNode =>
+            (bool, SymbolicRegexNode<TSet>) AddFixedLengthLoop(SymbolicRegexNode<TSet> concatNode)
             {
                 SymbolicRegexNode<TSet>? loopNode = concatNode._left;
                 if (loopNode is { _lower: <= 0 })
@@ -217,53 +250,22 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
 
                         if (loopNode._lower == loopNode._upper)
                         {
-                            // the entire loop is fixed, continue
                             pos += loopNode._lower;
+                            // the entire loop is fixed, continue
                             return (true, concatNode._right!);
                         }
 
                         // subtract the fixed part of the loop
                         int loopRemainder = loopNode._upper - loopNode._lower;
-                        SymbolicRegexNode<TSet> newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
+                        SymbolicRegexNode<TSet> newLeft =
+                            _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
                         SymbolicRegexNode<TSet> newNode = _builder.CreateConcat(newLeft, concatNode._right!);
                         pos += loopNode._lower;
                         return (true, newNode);
-
                     default:
                         return (false, concatNode);
                 }
-            });
-
-            while (canLoop)
-            {
-                (bool loop, SymbolicRegexNode<TSet> next) = current switch
-                {
-                    // Bail if it contains any anchors. (This could potentially be a very good future optimization for
-                    // anchors but there's too many edge cases to guarantee it works.
-                    // one example which fails currently: pattern: @"\By\b", input: "xy")
-                    { _info.ContainsSomeAnchor: true } => bail(current),
-
-                    // if this is reached then entire match is fixed length
-                    { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
-
-                    { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!),
-
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!),
-
-                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => addSingleton(current),
-
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current),
-
-                    _ => (false, current)
-                };
-                canLoop = loop;
-                current = next;
             }
-
-                return
-                    pos <= 0 ? new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0) :
-                    current == _builder.Epsilon ? new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos) :
-                    new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
         }
 
         /// <summary>

From 764ded8d4c35ba0b6d2beeee17a62bcd13e4b621 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 03:55:14 +0300
Subject: [PATCH 54/63] getstateflags

---
 .../Symbolic/SymbolicRegexMatcher.cs           | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 3a7b331125002..1d547b69239d1 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1595,18 +1595,16 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<
             public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
             {
                 SparseIntMap<int> stateSet = state.NfaState!.NfaStateSet;
+                // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
+                // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
+                // they are true for any state in the set; SimulatesBacktracking is true for all the states if
+                // it is true for any state (since it is a phase-wide property); and all other flags are masked out.
+                StateFlags flags = 0;
+                foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(stateSet.Values))
                 {
-                    // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
-                    // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
-                    // they are true for any state in the set; SimulatesBacktracking is true for all the states if
-                    // it is true for any state (since it is a phase-wide property); and all other flags are masked out.
-                    StateFlags flags = 0;
-                    foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(stateSet.Values))
-                    {
-                        flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
-                    }
-                    return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
+                    flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
                 }
+                return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
             }
 
 #if DEBUG

From 81d0dca2be560ee310f9fbc4e24228cbdd86a7be Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 03:56:17 +0300
Subject: [PATCH 55/63] formatting

---
 .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 1d547b69239d1..27646a170bb5b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -466,7 +466,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
 
                     }
                     matchStart = matchEnd < startat
-                        ? startat
+                    ? startat
                     : (_containsEndZAnchor, _containsAnyAnchor) switch
                     {
                         (true, true) =>

From 38f28b9ada0504ac8e6504ce1876496b19847011 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 04:33:23 +0300
Subject: [PATCH 56/63] removing unused interface

---
 .../Symbolic/SymbolicRegexMatcher.cs                 | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 27646a170bb5b..b80314c742840 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -385,16 +385,16 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
                 matchEnd = (_findOpts is not null, _containsAnyAnchor) switch
                 {
                     (true, true) =>
-                        FindEndPositionOptimized<OptimizedSmallInputReader, AcceleratedStateHandler,
+                        FindEndPositionOptimized<AcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                     (true, false) =>
-                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAnchorAcceleratedStateHandler,
+                        FindEndPositionOptimized<NoAnchorAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                     (false, false) =>
-                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
+                        FindEndPositionOptimized<NoAcceleratedStateHandler,
                             NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                     (false, true) =>
-                        FindEndPositionOptimized<OptimizedSmallInputReader, NoAcceleratedStateHandler,
+                        FindEndPositionOptimized<NoAcceleratedStateHandler,
                             AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                 };
             }
@@ -508,7 +508,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
         /// <summary>
         /// Streamlined version of <see cref="FindEndPositionFallback"/> that doesn't handle /z anchors or very large sets of minterms.
         /// </summary>
-        private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
+        private int FindEndPositionOptimized<TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
             ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
@@ -1641,7 +1641,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
                 Debug.Assert(pos < input.Length, "pos < input.Length");
                 Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}");
                 char c = input[pos];
-                return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
+                return c < (uint)lookup.Length ? lookup[c] : 0;
             }
         }
 

From cce11887f7ba39685851752ae78bf17ab35fc950 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Thu, 4 Jul 2024 16:27:42 +0300
Subject: [PATCH 57/63] local function typo

---
 .../Symbolic/SymbolicRegexMatcher.Automata.cs                | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 524073c11959b..306704994c3de 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -216,6 +216,7 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
                 current = next;
             }
 
+
             return
                 pos <= 0 ? new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0) :
                 current == _builder.Epsilon ? new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos) :
@@ -224,8 +225,8 @@ private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node
             // finding anchors inside pattern invalidates this optimization
             (bool, SymbolicRegexNode<TSet>) Bail()
             {
-                pos += 1;
-                // continue with next concat
+                pos = 0;
+                // return original node
                 return (false, node);
             }
 

From 8b946da4b2e95e26604a358e369e91b5359c4025 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Fri, 5 Jul 2024 16:42:27 +0300
Subject: [PATCH 58/63] temporarily removing minterms test

---
 .../FunctionalTests/Regex.Match.Tests.cs      | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 993879441bb28..e272942632aa7 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2654,36 +2654,36 @@ public static IEnumerable<object[]> MatchWordsInAnchoredRegexes_TestData()
             }
         }
 #if NET
-        [Fact]
-        public async Task MatchNonBacktrackingOver255Minterms()
-        {
-            // This is a test for the rare over 255 unique minterms case in MintermClassifier
-            StringBuilder pattern = new();
-            StringBuilder input = new();
-            for (int i = 256; i <= 768; i++)
-            {
-                string str = new Rune(i).ToString();
-                pattern.Append(str);
-                // adding an optional char as well just so it's not a string literal
-                pattern.Append(str);
-                pattern.Append('?');
-                // input is the pattern itself
-                input.Append(str);
-            }
-
-            // just so it's not allocated multiple times
-            string patternString = pattern.ToString();
-            string inputString = input.ToString();
-
-            foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
-            {
-                Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
-                MatchCollection ms = r.Matches(inputString);
-                Assert.Equal(1, ms.Count);
-                Assert.Equal(0, ms[0].Index);
-                Assert.Equal(513, ms[0].Length);
-            }
-        }
+        // [Fact]
+        // public async Task MatchNonBacktrackingOver255Minterms()
+        // {
+        //     // This is a test for the rare over 255 unique minterms case in MintermClassifier
+        //     StringBuilder pattern = new();
+        //     StringBuilder input = new();
+        //     for (int i = 256; i <= 768; i++)
+        //     {
+        //         string str = new Rune(i).ToString();
+        //         pattern.Append(str);
+        //         // adding an optional char as well just so it's not a string literal
+        //         pattern.Append(str);
+        //         pattern.Append('?');
+        //         // input is the pattern itself
+        //         input.Append(str);
+        //     }
+        //
+        //     // just so it's not allocated multiple times
+        //     string patternString = pattern.ToString();
+        //     string inputString = input.ToString();
+        //
+        //     foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+        //     {
+        //         Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
+        //         MatchCollection ms = r.Matches(inputString);
+        //         Assert.Equal(1, ms.Count);
+        //         Assert.Equal(0, ms[0].Index);
+        //         Assert.Equal(513, ms[0].Length);
+        //     }
+        // }
 #endif
     }
 }

From d3430b3d4be400e56d46e7e6ec3714a4c7dd797a Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Sat, 6 Jul 2024 11:07:30 +0300
Subject: [PATCH 59/63] re-adding minterms test

---
 .../FunctionalTests/Regex.Match.Tests.cs      | 59 +++++++++----------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index e272942632aa7..cdfa9e5d33113 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2654,36 +2654,35 @@ public static IEnumerable<object[]> MatchWordsInAnchoredRegexes_TestData()
             }
         }
 #if NET
-        // [Fact]
-        // public async Task MatchNonBacktrackingOver255Minterms()
-        // {
-        //     // This is a test for the rare over 255 unique minterms case in MintermClassifier
-        //     StringBuilder pattern = new();
-        //     StringBuilder input = new();
-        //     for (int i = 256; i <= 768; i++)
-        //     {
-        //         string str = new Rune(i).ToString();
-        //         pattern.Append(str);
-        //         // adding an optional char as well just so it's not a string literal
-        //         pattern.Append(str);
-        //         pattern.Append('?');
-        //         // input is the pattern itself
-        //         input.Append(str);
-        //     }
-        //
-        //     // just so it's not allocated multiple times
-        //     string patternString = pattern.ToString();
-        //     string inputString = input.ToString();
-        //
-        //     foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
-        //     {
-        //         Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
-        //         MatchCollection ms = r.Matches(inputString);
-        //         Assert.Equal(1, ms.Count);
-        //         Assert.Equal(0, ms[0].Index);
-        //         Assert.Equal(513, ms[0].Length);
-        //     }
-        // }
+        [Fact]
+        public async Task MatchNonBacktrackingOver255Minterms()
+        {
+            // This is a test for the rare over 255 unique minterms case in MintermClassifier
+            StringBuilder pattern = new();
+            StringBuilder input = new();
+            for (int i = 128; i <= 500; i++)
+            {
+                char c = (char)i;
+                pattern.Append(c);
+                // adding an optional char as well just so it's not a string literal
+                pattern.Append(c);
+                pattern.Append('?');
+                // input is the pattern itself
+                input.Append(c);
+            }
+        
+            string patternString = pattern.ToString();
+            string inputString = input.ToString();
+        
+            // foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+            // {
+            Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None);
+            MatchCollection ms = r.Matches(inputString);
+            Assert.Equal(1, ms.Count);
+            Assert.Equal(0, ms[0].Index);
+            Assert.Equal(373, ms[0].Length);
+            // }
+        }
 #endif
     }
 }

From 388c256331a91fb4c787fc7c4213788b06c5db1d Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 8 Jul 2024 22:34:05 +0300
Subject: [PATCH 60/63] reenabling test for all engines

---
 .../FunctionalTests/Regex.Match.Tests.cs      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index cdfa9e5d33113..2231062b5af57 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2670,18 +2670,18 @@ public async Task MatchNonBacktrackingOver255Minterms()
                 // input is the pattern itself
                 input.Append(c);
             }
-        
+
             string patternString = pattern.ToString();
             string inputString = input.ToString();
-        
-            // foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
-            // {
-            Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None);
-            MatchCollection ms = r.Matches(inputString);
-            Assert.Equal(1, ms.Count);
-            Assert.Equal(0, ms[0].Index);
-            Assert.Equal(373, ms[0].Length);
-            // }
+
+            foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
+            {
+                Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None);
+                MatchCollection ms = r.Matches(inputString);
+                Assert.Equal(1, ms.Count);
+                Assert.Equal(0, ms[0].Index);
+                Assert.Equal(373, ms[0].Length);
+            }
         }
 #endif
     }

From 270464102417572fff83a049b07d2919e687aca1 Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Mon, 8 Jul 2024 22:59:33 +0300
Subject: [PATCH 61/63] test bugfix

---
 .../tests/FunctionalTests/Regex.Match.Tests.cs                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 2231062b5af57..9e6db7976433b 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2660,7 +2660,7 @@ public async Task MatchNonBacktrackingOver255Minterms()
             // This is a test for the rare over 255 unique minterms case in MintermClassifier
             StringBuilder pattern = new();
             StringBuilder input = new();
-            for (int i = 128; i <= 500; i++)
+            for (int i = 128; i <= 400; i++)
             {
                 char c = (char)i;
                 pattern.Append(c);
@@ -2676,7 +2676,7 @@ public async Task MatchNonBacktrackingOver255Minterms()
 
             foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
             {
-                Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None);
+                Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
                 MatchCollection ms = r.Matches(inputString);
                 Assert.Equal(1, ms.Count);
                 Assert.Equal(0, ms[0].Index);

From 0abaabee87b9e9dcce6c55ae3901a0fa79ecc88d Mon Sep 17 00:00:00 2001
From: ieviev <ieviev@users.noreply.github.com>
Date: Tue, 9 Jul 2024 00:00:04 +0300
Subject: [PATCH 62/63] expected matches change

---
 .../tests/FunctionalTests/Regex.Match.Tests.cs                  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 9e6db7976433b..6ad2275f9584b 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2680,7 +2680,7 @@ public async Task MatchNonBacktrackingOver255Minterms()
                 MatchCollection ms = r.Matches(inputString);
                 Assert.Equal(1, ms.Count);
                 Assert.Equal(0, ms[0].Index);
-                Assert.Equal(373, ms[0].Length);
+                Assert.Equal(273, ms[0].Length);
             }
         }
 #endif

From 0a0f40982f089ca835306f37efc8eb5520e38f6e Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Wed, 10 Jul 2024 15:01:01 -0400
Subject: [PATCH 63/63] Review and clean up some code

Simplification, style consistency, dead code deletion, some bounds-check removal, etc.
---
 .../RegularExpressions/RegexReplacement.cs    |   1 -
 .../Symbolic/MatchReversal.cs                 |  42 +-
 .../Symbolic/MatchReversalKind.cs             |  30 +-
 .../Symbolic/MatchingState.cs                 |  44 +-
 .../Symbolic/MintermClassifier.cs             |  82 ++--
 .../Symbolic/SymbolicRegexMatcher.Automata.cs | 169 +++----
 .../Symbolic/SymbolicRegexMatcher.cs          | 443 ++++++++----------
 .../Symbolic/SymbolicRegexNode.cs             |  37 +-
 .../Symbolic/SymbolicRegexThresholds.cs       |  11 +-
 .../FunctionalTests/Regex.Match.Tests.cs      |  27 +-
 10 files changed, 410 insertions(+), 476 deletions(-)

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
index 49205f5ee2649..d2aec2621a81c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
@@ -5,7 +5,6 @@
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 
 #pragma warning disable CS8500 // takes address of managed type
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
index cd00755dbe6dc..2ea1ea8af7422 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -1,15 +1,39 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-namespace System.Text.RegularExpressions.Symbolic;
+using System.Diagnostics;
 
-internal readonly struct MatchReversal<TSet>(
-    MatchReversalKind kind,
-    int fixedLength,
-    MatchingState<TSet>? adjustedStartState = null)
-    where TSet : IComparable<TSet>, IEquatable<TSet>
+namespace System.Text.RegularExpressions.Symbolic
 {
-    internal MatchReversalKind Kind { get; } = kind;
-    internal int FixedLength { get; } = fixedLength;
-    internal MatchingState<TSet>? AdjustedStartState { get; } = adjustedStartState;
+    /// <summary>Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.</summary>
+    internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
+    {
+        /// <summary>Initializes the match reversal details.</summary>
+        internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
+        {
+            Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
+            Debug.Assert(fixedLength >= 0);
+            Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));
+
+            Kind = kind;
+            FixedLength = fixedLength;
+            AdjustedStartState = adjustedStartState;
+        }
+
+        /// <summary>Gets the kind of the match reversal processing required.</summary>
+        internal MatchReversalKind Kind { get; }
+
+        /// <summary>Gets the fixed length of the match, if one is known.</summary>
+        /// <remarks>
+        /// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
+        /// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
+        /// by subtracting this length from the end.
+        /// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of fixed portion of the match.
+        /// </remarks>
+        internal int FixedLength { get; }
+
+        /// <summary>Gets the adjusted start state to use for partial fixed-length matches.</summary>
+        /// <remarks>This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
+        internal MatchingState<TSet>? AdjustedStartState { get; }
+    }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
index d498e4dd7eb99..a949e6204a16a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
@@ -1,14 +1,26 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-namespace System.Text.RegularExpressions.Symbolic;
-
-internal enum MatchReversalKind
+namespace System.Text.RegularExpressions.Symbolic
 {
-    /// <summary>The most generic option, run the regex backwards to find beginning of match</summary>
-    MatchStart,
-    /// <summary>Part of the reversal is fixed length and can be skipped</summary>
-    PartialFixedLength,
-    /// <summary>The entire pattern is fixed length, reversal not necessary</summary>
-    FixedLength
+    /// <summary>Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.</summary>
+    internal enum MatchReversalKind
+    {
+        /// <summary>The regex should be run in reverse to find beginning of the match.</summary>
+        MatchStart,
+
+        /// <summary>The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.</summary>
+        /// <remarks>
+        /// Reverse execution is not necessary for a subset of the match.
+        /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
+        /// </remarks>
+        PartialFixedLength,
+
+        /// <summary>The entire pattern is of a fixed length.</summary>
+        /// <remarks>
+        /// Reverse execution is not necessary to find the beginning of the match.
+        /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
+        /// </remarks>
+        FixedLength
+    }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index 405be0318bbd5..3aacc4a61cbb9 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
             NullabilityInfo = BuildNullabilityInfo();
         }
 
-        internal int NullabilityInfo { get; }
-
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
 
@@ -98,15 +96,31 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
             return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
         }
 
-        /// <summary>
-        /// Cached nullability check with encoded bits
-        /// </summary>
+        /// <summary>Determines whether the node is nullable for the given context.</summary>
+        /// <remarks>
+        /// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
+        /// answers stored in <see cref="NullabilityInfo"/>.
+        /// </remarks>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
         {
-            return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
+            Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
+            return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
         }
 
+        /// <summary>Gets the nullability info for the matching state.</summary>
+        /// <remarks>
+        /// <list>
+        /// <item>00000 -> node cannot be nullable</item>
+        /// <item>00001 -> nullable for General</item>
+        /// <item>00010 -> nullable for BeginningEnd</item>
+        /// <item>00100 -> nullable for NewLine</item>
+        /// <item>01000 -> nullable for NewLineS</item>
+        /// <item>10000 -> nullable for WordLetter</item>
+        /// </list>
+        /// </remarks>
+        internal int NullabilityInfo { get; }
+
         /// <summary>
         /// Builds a <see cref="StateFlags"/> with the relevant flags set.
         /// </summary>
@@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial)
             return info;
         }
 
-        /// <summary>
-        /// Builds the nullability information for the matching state.
-        /// Nullability for each context is encoded in a bit
-        /// 0 means node cannot be nullable
-        /// 00001 -> nullable for General
-        /// 00010 -> nullable for BeginningEnd
-        /// 00100 -> nullable for NewLine
-        /// 01000 -> nullable for NewLineS
-        /// 10000 -> nullable for WordLetter
-        /// </summary>
-        internal byte BuildNullabilityInfo()
+        /// <summary>Builds the nullability information for the matching state.</summary>
+        /// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
+        private byte BuildNullabilityInfo()
         {
             byte nullabilityInfo = 0;
             if (Node.CanBeNullable)
             {
-                for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
+                for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
                 {
-                    nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0);
+                    nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
                 }
             }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index 7a1af1fb5496b..24d2a26f84922 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -1,7 +1,9 @@
 ﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers;
 using System.Diagnostics;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 
 namespace System.Text.RegularExpressions.Symbolic
@@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
-        /// <summary>An array used to map characters to minterms</summary>
+        /// <summary>Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.</summary>
+        /// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
         private readonly byte[]? _lookup;
 
-        /// <summary>
-        /// Fallback lookup if over 255 minterms. This is rarely used.
-        /// </summary>
+        /// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
+        /// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
         private readonly int[]? _intLookup;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
@@ -37,51 +39,54 @@ public MintermClassifier(BDD[] minterms)
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
-                _lookup = Array.Empty<byte>();
+                _lookup = [];
                 return;
             }
 
-            int _maxChar = -1;
-            // attempt to save memory in common cases by allocating only up to the highest char code
+            // Compute all minterm ranges. We do this here in order to determine the maximum character value
+            // in order to size the lookup array to minimize steady-state memory consumption of the potentially
+            // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
+            // consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case.
+            // However, when there are more than 255 minterms, we need to use int[] _intLookup.
+            (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
+
+            int maxChar = -1;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
-                _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
+                (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                charRangesPerMinterm[mintermId] = ranges;
+                maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
             }
 
-            // It's incredibly rare for a regex to use more than a hundred or two minterms,
-            // but we need a fallback just in case.
+            // It's incredibly rare for a regex to use more than a couple hundred minterms,
+            // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
             if (minterms.Length > 255)
             {
-                // over 255 unique sets also means it's never ascii only
-                int[] lookup = new int[_maxChar + 1];
-                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
-                {
-                    // precompute all assigned minterm categories
-                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                    foreach ((uint start, uint end) in mintermRanges)
-                    {
-                        // assign character ranges in bulk
-                        Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                        slice.Fill(mintermId);
-                    }
-                }
-                _intLookup = lookup;
+                _intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
             }
             else
             {
-                byte[] lookup = new byte[_maxChar + 1];
+                _lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
+            }
+
+            // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
+            Array.Clear(charRangesPerMinterm, 0, minterms.Length);
+            ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
+
+            // Creates the lookup array.
+            static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
+            {
+                T[] lookup = new T[_maxChar + 1];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
-                    // precompute all assigned minterm categories
-                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                    foreach ((uint start, uint end) in mintermRanges)
+                    // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
+                    foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
                     {
-                        // assign character ranges in bulk
-                        Span<byte> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                        slice.Fill((byte)mintermId);
+                        lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
                     }
                 }
-                _lookup = lookup;
+
+                return lookup;
             }
         }
 
@@ -89,9 +94,9 @@ public MintermClassifier(BDD[] minterms)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
-            if (_intLookup is null)
+            if (_lookup is not null)
             {
-                byte[] lookup = _lookup!;
+                byte[] lookup = _lookup;
                 return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
             }
             else
@@ -104,20 +109,17 @@ public int GetMintermID(int c)
         /// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
         /// Null if there are greater than 255 minterms.
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public byte[]? ByteLookup() => _lookup;
+        public byte[]? ByteLookup => _lookup;
 
         /// <summary>
         /// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
         /// Null in the common case where there are fewer than 255 minterms.
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public int[]? IntLookup() => _intLookup;
+        public int[]? IntLookup => _intLookup;
 
         /// <summary>
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1;
+        public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
     }
 }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 306704994c3de..327f5666f9e2a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -26,7 +26,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// Cache for the states that have been created. Each state is uniquely identified by its associated
         /// <see cref="SymbolicRegexNode{TSet}"/> and the kind of the previous character.
         /// </summary>
-        private readonly Dictionary<(SymbolicRegexNode<TSet> Node, uint PrevCharKind), MatchingState<TSet>> _stateCache = new();
+        private readonly Dictionary<(SymbolicRegexNode<TSet> Node, uint PrevCharKind), MatchingState<TSet>> _stateCache = [];
 
         /// <summary>
         /// Maps state ids to states, initial capacity is given by <see cref="InitialDfaStateCapacity"/>.
@@ -41,20 +41,14 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// </summary>
         private StateFlags[] _stateFlagsArray;
 
-        /// <summary>
-        /// important: the pattern must not contain endZ for this to be valid.
-        /// Used to short-circuit nullability in the hot loop
-        /// nullability for each context is encoded in a bit
-        /// 0 means node cannot be nullable
-        /// 00001 -> nullable for General
-        /// 00010 -> nullable for BeginningEnd
-        /// 00100 -> nullable for NewLine
-        /// 01000 -> nullable for NewLineS
-        /// 10000 -> nullable for WordLetter
-        /// </summary>
+        /// <summary>Cached nullability info for each state ID.</summary>
+        /// <remarks>
+        /// _nullabilityArray[stateId] == the <see cref="MatchingState{TSet}.NullabilityInfo"/> for that state.
+        /// Used to short-circuit nullability in the hot loop.
+        /// Important: the pattern must not contain endZ for this to be valid.
+        /// </remarks>
         private byte[] _nullabilityArray;
 
-
         /// <summary>
         /// The transition function for DFA mode.
         /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
@@ -84,7 +78,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
         /// It is the inverse of used entries in _nfaStateArray.
         /// The range of this map is 0 to its size - 1.
         /// </summary>
-        private readonly Dictionary<int, int> _nfaIdByCoreId = new();
+        private readonly Dictionary<int, int> _nfaIdByCoreId = [];
 
         /// <summary>
         /// Transition function for NFA transitions in NFA mode.
@@ -127,7 +121,7 @@ private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private bool IsNullableWithContext(int stateId, int mintermId) =>
-            ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0;
+            (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0;
 
         /// <summary>Returns the span from <see cref="_dfaDelta"/> that may contain transitions for the given state</summary>
         private Span<int> GetDeltasFor(MatchingState<TSet> state)
@@ -175,98 +169,75 @@ private MatchingState<TSet> GetOrCreateState(SymbolicRegexNode<TSet> node, uint
         }
 
         /// <summary>
-        /// Optimized reversal state computation during construction which
-        /// skips the fixed length parts of reversal
-        /// e.g. for the pattern abc.*def
+        /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process
+        /// for when finding the beginning of a match.
+        /// </summary>
+        /// <remarks>
+        /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. for the pattern abc.*def
         /// 1) the end is found at abc.*def|
         /// 2) the reversal starts at abc.*|
-        /// </summary>
-        /// <param name="node">reversed initial pattern</param>
-        /// <returns>returns num of chars to skip and adjusted reversal start state</returns>
-        private MatchReversal<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
+        /// </remarks>
+        /// <param name="node">Reversed initial pattern</param>
+        /// <returns>The match reversal details.</returns>
+        private MatchReversalInfo<TSet> CreateOptimizedReversal(SymbolicRegexNode<TSet> node)
         {
             int pos = 0;
-            SymbolicRegexNode<TSet> current = node;
-            bool canLoop = true;
-
-            while (canLoop)
+            while (true)
             {
-                (bool loop, SymbolicRegexNode<TSet> next) = current switch
+                if (node._info.ContainsSomeAnchor)
                 {
-                    // Bail if it contains any anchors. (This could potentially be a very good future optimization for
-                    // anchors but there's too many edge cases to guarantee it works.
-                    // one example which fails currently: pattern: @"\By\b", input: "xy")
-                    { _info.ContainsSomeAnchor: true } => Bail(),
-
-                    // if this is reached then entire match is fixed length
-                    { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon),
-
-                    { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!),
-
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!),
-
-                    {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => AddSingleton(current),
-
-                    {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } =>
-                        AddFixedLengthLoop(current),
-
-                    _ => (false, current)
-                };
-                canLoop = loop;
-                current = next;
-            }
-
-
-            return
-                pos <= 0 ? new MatchReversal<TSet>(MatchReversalKind.MatchStart, 0) :
-                current == _builder.Epsilon ? new MatchReversal<TSet>(MatchReversalKind.FixedLength, pos) :
-                new MatchReversal<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0));
+                    // Bail if it contains any anchors as it invalidates the optimization.
+                    // (This could potentially be a very good future optimization for anchors but there's too many edge cases to guarantee it works.
+                    // One example which fails currently: pattern: @"\By\b", input: "xy")
+                    pos = 0;
+                    break;
+                }
 
-            // finding anchors inside pattern invalidates this optimization
-            (bool, SymbolicRegexNode<TSet>) Bail()
-            {
-                pos = 0;
-                // return original node
-                return (false, node);
-            }
+                if (node._kind is not SymbolicRegexNodeKind.Concat)
+                {
+                    if (node._kind is SymbolicRegexNodeKind.CaptureStart)
+                    {
+                        node = _builder.Epsilon; // The entire match is fixed length.
+                    }
+                    break;
+                }
 
-            (bool, SymbolicRegexNode<TSet>) AddSingleton(SymbolicRegexNode<TSet> concatNode)
-            {
-                pos += 1;
-                // continue with next concat
-                return (true, concatNode._right!);
-            }
+                SymbolicRegexNode<TSet>? left = node._left;
+                Debug.Assert(left is not null);
 
-            (bool, SymbolicRegexNode<TSet>) AddFixedLengthLoop(SymbolicRegexNode<TSet> concatNode)
-            {
-                SymbolicRegexNode<TSet>? loopNode = concatNode._left;
-                if (loopNode is { _lower: <= 0 })
+                if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton)
                 {
-                    return (false, concatNode);
+                    node = node._right!;
+                    if (left._kind is SymbolicRegexNodeKind.Singleton)
+                    {
+                        pos++;
+                    }
                 }
+                else if (left._kind is SymbolicRegexNodeKind.Loop)
+                {
+                    if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton)
+                    {
+                        break;
+                    }
 
-                switch (loopNode!._left!.Kind)
+                    node = left._lower == left._upper ?
+                        node._right! : // The entire loop is fixed
+                        _builder.CreateConcat( // Subtract the fixed part of the loop.
+                            _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower),
+                            node._right!);
+                    pos += left._lower;
+                }
+                else
                 {
-                    case SymbolicRegexNodeKind.Singleton:
-
-                        if (loopNode._lower == loopNode._upper)
-                        {
-                            pos += loopNode._lower;
-                            // the entire loop is fixed, continue
-                            return (true, concatNode._right!);
-                        }
-
-                        // subtract the fixed part of the loop
-                        int loopRemainder = loopNode._upper - loopNode._lower;
-                        SymbolicRegexNode<TSet> newLeft =
-                            _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder);
-                        SymbolicRegexNode<TSet> newNode = _builder.CreateConcat(newLeft, concatNode._right!);
-                        pos += loopNode._lower;
-                        return (true, newNode);
-                    default:
-                        return (false, concatNode);
+                    break;
                 }
             }
+
+            Debug.Assert(pos >= 0);
+            return
+                pos == 0 ? new MatchReversalInfo<TSet>(MatchReversalKind.MatchStart, 0) :
+                node == _builder.Epsilon ? new MatchReversalInfo<TSet>(MatchReversalKind.FixedLength, pos) :
+                new MatchReversalInfo<TSet>(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0));
         }
 
         /// <summary>
@@ -299,7 +270,7 @@ private MatchingState<TSet> GetOrCreateState_NoLock(SymbolicRegexNode<TSet> node
                 }
                 _stateArray[state.Id] = state;
                 _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
-                _nullabilityArray[state.Id] = state.BuildNullabilityInfo();
+                _nullabilityArray[state.Id] = (byte)state.NullabilityInfo;
             }
 
             return state;
@@ -395,11 +366,8 @@ private bool TryCreateNewTransition(
                 MatchingState<TSet>? targetState = _stateArray[_dfaDelta[offset]];
                 if (targetState is null)
                 {
-                    if (// check if there is an active timer
-                        (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) ||
-                        // check if amount of nodes exceeds the NFA threshold
-                        (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)
-                    )
+                    if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer
+                        (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold
                     {
                         nextState = null;
                         return false;
@@ -438,7 +406,7 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse
                     SymbolicRegexNode<TSet> targetNode = coreTargetId > 0 ?
                         GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind);
 
-                    List<int> targetsList = new();
+                    List<int> targetsList = [];
                     ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List<int> targetsList) =>
                         targetsList.Add(nfaId));
 
@@ -465,8 +433,9 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse
                     TSet minterm = GetMintermFromId(mintermId);
                     uint nextCharKind = GetPositionKind(mintermId);
                     List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind);
+
                     // Build the new state and store it into the array.
-                    List<(int, DerivativeEffect[])> targetsList = new();
+                    List<(int, DerivativeEffect[])> targetsList = [];
                     foreach ((SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects) entry in transition)
                     {
                         ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects),
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index b80314c742840..08f423b03344a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1,7 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System.Buffers;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
@@ -84,17 +83,16 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
 
         /// <summary>
         /// Dead end state to quickly return NoMatch.
-        /// This could potentially be a constant if it's the very first state created
         /// </summary>
         private readonly int _deadStateId;
 
-        /// <summary>Initial state used for vectorization</summary>
+        /// <summary>Initial state used for vectorization.</summary>
         private readonly int _initialStateId;
 
-        /// <summary>Whether the pattern contains any anchor</summary>
+        /// <summary>Whether the pattern contains any anchor.</summary>
         private readonly bool _containsAnyAnchor;
 
-        /// <summary>Whether the pattern contains the EndZ anchor, which makes most optimization shortcuts invalid</summary>
+        /// <summary>Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts.</summary>
         private readonly bool _containsEndZAnchor;
 
         /// <summary>The initial states for the original pattern, keyed off of the previous character kind.</summary>
@@ -109,10 +107,8 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _reverseInitialStates;
 
-        /// <summary>
-        /// Reversal state which skips fixed length parts.
-        /// </summary>
-        private readonly MatchReversal<TSet> _optimizedReversalState;
+        /// <summary>Details on optimized processing of the reverse of the pattern to find the beginning of a match.</summary>
+        private readonly MatchReversalInfo<TSet> _optimizedReversalInfo;
 
         /// <summary>Partition of the input space of sets.</summary>
         private readonly TSet[] _minterms;
@@ -190,8 +186,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                 ((BitVectorSolver)(object)builder._solver)._classifier;
             _capsize = captureCount;
 
-
-            // Initialization for fields in SymbolicRegexMatcher.Automata.cs
+            // Initialize state and nullability arrays.
             _stateArray = new MatchingState<TSet>[InitialDfaStateCapacity];
             _stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
             _nullabilityArray = new byte[InitialDfaStateCapacity];
@@ -206,8 +201,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
                 _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId);
             }
 
-            // Create optimized reversal
-            _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder));
+            // Gather optimized reversal processing information.
+            _optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder));
 
             // Store the find optimizations that can be used to jump ahead to the next possible starting location.
             // If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's
@@ -251,9 +246,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNo
             }
             _dotstarredInitialStates = dotstarredInitialStates;
 
-            // Assign dead state id
+            // Assign dead and initial state ids
             _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
-            // Assign initial state id
             _initialStateId = _dotstarredInitialStates[CharKind.General].Id;
 
             // Create the reverse pattern (the original pattern in reverse order) and all of its
@@ -378,38 +372,25 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // the position of the last b: aacaaaabbbc.  It additionally records the position of the first a after
             // the c as the low boundary for the starting position.
 
-            // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases
+            // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases.
             int matchEnd;
-            if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null))
+            if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null)
             {
+                // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments.
                 matchEnd = (_findOpts is not null, _containsAnyAnchor) switch
                 {
-                    (true, true) =>
-                        FindEndPositionOptimized<AcceleratedStateHandler,
-                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (true, false) =>
-                        FindEndPositionOptimized<NoAnchorAcceleratedStateHandler,
-                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, false) =>
-                        FindEndPositionOptimized<NoAcceleratedStateHandler,
-                            NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
-                    (false, true) =>
-                        FindEndPositionOptimized<NoAcceleratedStateHandler,
-                            AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (true, true) =>   FindEndPositionOptimized<AcceleratedStateHandler, AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (true, false) =>  FindEndPositionOptimized<NoAnchorAcceleratedStateHandler, NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, false) => FindEndPositionOptimized<NoAcceleratedStateHandler, NoAnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
+                    (false, true) =>  FindEndPositionOptimized<NoAcceleratedStateHandler, AnchorOptimizedNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData),
                 };
             }
             else
             {
-                // fallback for Z anchor or over 255 minterms
-                matchEnd = (_findOpts is not null) switch
-                {
-                    true =>
-                        FindEndPositionFallback<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(
-                            input, startat, timeoutOccursAt, mode, perThreadData),
-                    false =>
-                        FindEndPositionFallback<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(
-                            input, startat, timeoutOccursAt, mode, perThreadData),
-                };
+                // Fallback for Z anchor or over 255 minterms
+                matchEnd = _findOpts is not null ?
+                    FindEndPositionFallback<FullInputReader, InitialStateFindOptimizationsHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData) :
+                    FindEndPositionFallback<FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(input, startat, timeoutOccursAt, mode, perThreadData);
             }
 
             // If there wasn't a match, we're done.
@@ -431,60 +412,57 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
             // recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that
             // exact number of positions backwards.  Continuing the previous example, phase 2 will walk backwards from
             // that last b until it finds the 4th a: aaabbbc.
-            int matchStart;
+            int matchStart = 0;
             Debug.Assert(matchEnd >= startat - 1);
-            switch (_optimizedReversalState.Kind)
+            switch (_optimizedReversalInfo.Kind)
             {
-                case MatchReversalKind.FixedLength:
-                    matchStart = (matchEnd - _optimizedReversalState.FixedLength);
-                    break;
-
                 case MatchReversalKind.MatchStart:
                 case MatchReversalKind.PartialFixedLength:
                     int initialLastStart = -1; // invalid sentinel value
                     int i = matchEnd;
                     CurrentState reversalStartState;
-                    if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength)
+
+                    if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart)
                     {
-                        i -= _optimizedReversalState.FixedLength;
-                        reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!);
+                        // No fixed-length knowledge. Start at the end of the match.
+                        reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind<FullInputReader>(input, matchEnd)]);
+                    }
+                    else
+                    {
+                        // There's a fixed-length portion at the end of the match. Start just before it.
+                        i -= _optimizedReversalInfo.FixedLength;
+                        reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!);
 
                         // reversal may already be nullable here in the case of anchors
-                        if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0)
-                        {
-                            if (FullNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
-                                in reversalStartState, FullInputReader.GetPositionId(this, input, i),
+                        if (_containsAnyAnchor &&
+                            _nullabilityArray[reversalStartState.DfaStateId] > 0 &&
+                            FullNullabilityHandler.IsNullableAt<DfaStateHandler>(
+                                this, in reversalStartState, FullInputReader.GetPositionId(this, input, i),
                                 DfaStateHandler.GetStateFlags(this, in reversalStartState)))
-                            {
-                                initialLastStart = i;
-                            }
+                        {
+                            initialLastStart = i;
                         }
                     }
-                    else
-                    {
-                        reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind<FullInputReader>(input, matchEnd)]);
 
-                    }
-                    matchStart = matchEnd < startat
-                    ? startat
-                    : (_containsEndZAnchor, _containsAnyAnchor) switch
+                    matchStart = matchEnd < startat ? startat : (_containsEndZAnchor, _containsAnyAnchor) switch
                     {
-                        (true, true) =>
-                            FindStartPosition<FullInputReader, FullNullabilityHandler>(
-                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
-                        (true, false) =>
-                            FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(
-                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
-                        (false, true) =>
-                            FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(
-                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
-                        (false, false) =>
-                            FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(
-                                reversalStartState, initialLastStart, input, i, startat, perThreadData),
+                        // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization;
+                        // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized.
+                        (true, true) =>   FindStartPosition<FullInputReader, FullNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+                        (true, false) =>  FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+                        (false, true) =>  FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+                        (false, false) => FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(reversalStartState, initialLastStart, input, i, startat, perThreadData),
                     };
                     break;
+
+                case MatchReversalKind.FixedLength:
+                    // The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there.
+                    matchStart = matchEnd - _optimizedReversalInfo.FixedLength;
+                    break;
+
                 default:
-                    throw new ArgumentOutOfRangeException();
+                    Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}");
+                    break;
             }
 
             // Phase 3:
@@ -513,8 +491,7 @@ private int FindEndPositionOptimized<TAcceleratedStateHandler, TOptimizedNullabi
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
-            // this initial state candidate is not really used in the common DFA case
-            // and could potentially be removed in the future
+            // Initial state candidate. (This is not used in the common DFA caseand could potentially be removed in the future.)
             int initialStatePosCandidate = pos;
             var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind<FullInputReader>(input, pos - 1)]);
             int endPos = NoMatchExists;
@@ -527,26 +504,19 @@ private int FindEndPositionOptimized<TAcceleratedStateHandler, TOptimizedNullabi
                 if (currentState.NfaState is null)
                 {
                     const int DfaCharsPerTimeoutCheck = 100_000;
-                    innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck
-                        ? pos + DfaCharsPerTimeoutCheck
-                        : lengthMinus1;
-                    done =
-                        FindEndPositionDeltasDFAOptimized<
-                            TAcceleratedStateHandler,
-                            TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos,
-                            ref currentState.DfaStateId, ref endPos);
+                    innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1;
+                    done = FindEndPositionDeltasDFAOptimized<TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
+                        input, innerLoopLength, mode, timeoutOccursAt, ref pos,
+                        ref currentState.DfaStateId, ref endPos);
                 }
                 else
                 {
-                    // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
-                    const int NfaCharsPerTimeoutCheck = 1000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
-                        ? pos + NfaCharsPerTimeoutCheck
-                        : input.Length;
-                    done =
-                        FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler,
-                            FullNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
-                            ref initialStatePosCandidate, ref initialStatePosCandidate);
+                    // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing.
+                    const int NfaCharsPerTimeoutCheck = 1_000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
+                    done = FindEndPositionDeltasNFA<NfaStateHandler, FullInputReader, NoOptimizationsInitialStateHandler, FullNullabilityHandler>(
+                        input, innerLoopLength, mode, timeoutOccursAt, ref pos,
+                        ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate);
                 }
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -611,27 +581,18 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                 bool done;
                 if (currentState.NfaState is null)
                 {
-                    const int DfaCharsPerTimeoutCheck = 25000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck
-                        ? pos + DfaCharsPerTimeoutCheck
-                        : input.Length;
-                    done =
-                        FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
-                            ref endStateId, ref initialStatePosCandidate);
+                    const int DfaCharsPerTimeoutCheck = 25_000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length;
+                    done = FindEndPositionDeltasDFA<DfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(
+                        input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
                 }
                 else
                 {
-                    // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
-                    // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
-                    const int NfaCharsPerTimeoutCheck = 1000;
-                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
-                        ? pos + NfaCharsPerTimeoutCheck
-                        : input.Length;
-                    done =
-                        FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler,
-                            TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos,
-                            ref endStateId, ref initialStatePosCandidate);
+                    // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here.
+                    const int NfaCharsPerTimeoutCheck = 1_000;
+                    innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
+                    done = FindEndPositionDeltasNFA<NfaStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(
+                        input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
                 }
 
                 // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
@@ -665,18 +626,17 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
 
         /// <summary>
         /// This version of <see cref="FindEndPositionDeltasDFA"/> uses a different set of interfaces,
-        /// which don't check for many inner loop edge cases e.g. input end or '\n'.
+        /// which don't check for many inner loop edge cases, e.g. input end or '\n'.
         /// All edge cases are handled before entering the loop.
         /// </summary>
-        private bool FindEndPositionDeltasDFAOptimized<TAcceleratedStateHandler,
-            TOptimizedNullabilityHandler>(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
+        private bool FindEndPositionDeltasDFAOptimized<TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
+            ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
             long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef)
             where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
             where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
         {
-            // initial check for input end to get it out of the loop
+            // Initial check for input end lifted out of the subsequent hot-path loop.
             if (posRef == input.Length)
-
             {
                 if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0]))
                 {
@@ -688,12 +648,12 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
 
             // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
             int pos = posRef;
-            int endPos = endPosRef;
-            byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int currStateId = currentStateIdRef;
+            int endPos = endPosRef;
+
+            byte[] mtlookup = _mintermClassifier.ByteLookup!;
             int deadStateId = _deadStateId;
             int initialStateId = _initialStateId;
-            int maxChar = _mintermClassifier.MaxChar();
             try
             {
                 // The goal is to make this loop as fast as it can possibly be,
@@ -705,8 +665,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                         return true;
                     }
 
-                    if (TAcceleratedStateHandler.TryFindNextStartingPosition(
-                        this, mtlookup, input, ref currStateId, ref pos, initialStateId))
+                    if (TAcceleratedStateHandler.TryFindNextStartingPosition(this, input, mtlookup, ref currStateId, ref pos, initialStateId))
                     {
                         if (pos == input.Length)
                         {
@@ -723,8 +682,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                     }
 
                     // If the state is nullable for the next character, we found a potential end state.
-                    if (TOptimizedNullabilityHandler.IsNullable(
-                        this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos))
+                    if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, input, pos))
                     {
                         endPos = pos;
 
@@ -737,8 +695,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
 
                     // If there is more input available try to transition with the next character.
                     // Note: the order here is important so the transition itself gets taken
-                    if (!DfaStateHandler.TryTakeDFATransition(
-                        this, ref currStateId, OptimizedSmallInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) ||
+                    if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, GetMintermId(mtlookup, input, pos), timeoutOccursAt) ||
                         pos >= lengthMinus1)
                     {
                         if (pos + 1 < input.Length)
@@ -755,6 +712,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
                             return true;
 
                         }
+
                         // the end position (-1) was nullable
                         endPos = pos;
                         return true;
@@ -802,6 +760,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
             int endPos = endPosRef;
             int initialStatePos = initialStatePosRef;
             int initialStatePosCandidate = initialStatePosCandidateRef;
+
             int deadStateId = _deadStateId;
             int initialStateId = _initialStateId;
             try
@@ -809,12 +768,13 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
                 // Loop through each character in the input, transitioning from state to state for each.
                 while (true)
                 {
-                    if (state.DfaStateId == deadStateId)
+                    int dfaStateId = state.DfaStateId;
+                    if (dfaStateId == deadStateId)
                     {
                         return true;
                     }
 
-                    if (state.DfaStateId == initialStateId)
+                    if (dfaStateId == initialStateId)
                     {
                         if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
                         {
@@ -827,8 +787,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state,
-                        positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
 
@@ -1178,7 +1137,7 @@ private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i,
             }
 
             Debug.Assert(current.Count > 0);
-            foreach (var (endStateId, endRegisters) in current.Values)
+            foreach ((int endStateId, Registers endRegisters) in current.Values)
             {
                 MatchingState<TSet> endState = GetState(GetCoreStateId(endStateId));
                 if (endState.IsNullableFor(GetCharKind<TInputReader>(input, iEnd)))
@@ -1194,6 +1153,16 @@ private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i,
             return default;
         }
 
+        /// <summary>Look up the min term ID for the character at the specified position in the input.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan<char> input, int pos)
+        {
+            Debug.Assert(pos >= 0 && pos < input.Length);
+
+            char c = input[pos];
+            return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0;
+        }
+
         /// <summary>Stores additional data for tracking capture start and end positions.</summary>
         /// <remarks>The NFA simulation based third phase has one of these for each current state in the current set of live states.</remarks>
         internal struct Registers(int[] captureStarts, int[] captureEnds)
@@ -1442,8 +1411,8 @@ internal static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, re
             /// - whether this state may be contextually nullable
             /// </summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
-                => matcher._stateFlagsArray[state.DfaStateId];
+            public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state) =>
+                matcher._stateFlagsArray[state.DfaStateId];
         }
 
         /// <summary>An <see cref="IStateHandler"/> for operating over <see cref="CurrentState"/> instances configured as NFA states.</summary>
@@ -1594,16 +1563,16 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static StateFlags GetStateFlags(SymbolicRegexMatcher<TSet> matcher, in CurrentState state)
             {
-                SparseIntMap<int> stateSet = state.NfaState!.NfaStateSet;
                 // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
                 // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
                 // they are true for any state in the set; SimulatesBacktracking is true for all the states if
                 // it is true for any state (since it is a phase-wide property); and all other flags are masked out.
                 StateFlags flags = 0;
-                foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(stateSet.Values))
+                foreach (ref KeyValuePair<int, int> nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
                 {
                     flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
                 }
+
                 return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
             }
 
@@ -1628,61 +1597,6 @@ public static void UndoTransition(ref CurrentState state)
 #endif
         }
 
-
-
-        // /// <summary>
-        // /// This reader maps all characters > maxChar to 0
-        // /// </summary>
-        private readonly struct OptimizedSmallInputReader
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
-            {
-                Debug.Assert(pos < input.Length, "pos < input.Length");
-                Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}");
-                char c = input[pos];
-                return c < (uint)lookup.Length ? lookup[c] : 0;
-            }
-        }
-
-        /// <summary>
-        /// This nullability handler interface can be used in DFAs
-        /// for patterns that do not contain \Z
-        /// </summary>
-        private interface IOptimizedNullabilityHandler
-        {
-            public static abstract bool IsNullable(SymbolicRegexMatcher<TSet> matcher,
-                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input,
-                int pos);
-        }
-
-        private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup,
-                int maxChar, ReadOnlySpan<char> input, int pos)
-            {
-                Debug.Assert(pos < input.Length, "input end should not be handled here");
-                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
-                return nullabilityArray[currStateId] > 0;
-            }
-        }
-
-        private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
-        {
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher,
-                byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
-            {
-                Debug.Assert(pos < input.Length, "input end should not be handled here");
-                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
-                return
-                    nullabilityArray[currStateId] > 0 &&
-                    matcher.IsNullableWithContext(currStateId,
-                        input[pos] < (uint)lookup.Length ? lookup[input[pos]] : 0);
-            }
-        }
-
         /// <summary>
         /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to
         /// both take transitions and decide nullability. For positions of valid characters that are handled normally,
@@ -1695,13 +1609,11 @@ private interface IInputReader
             public static abstract int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos);
         }
 
-
-
         /// <summary>This reader omits the special handling of \n for the \Z anchor.</summary>
         private readonly struct NoZAnchorInputReader : IInputReader
         {
             public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos) =>
-                (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]);
+                (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1;
         }
 
         /// <summary>This reader includes full handling of an \n as the last character of input for the \Z anchor.</summary>
@@ -1709,23 +1621,24 @@ public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan
         {
             public static int GetPositionId(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, int pos)
             {
-                if ((uint)pos >= (uint)input.Length)
-                    return -1;
-
-                int c = input[pos];
+                if ((uint)pos < (uint)input.Length)
+                {
+                    // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
+                    int c = input[pos];
+                    return c == '\n' && pos == input.Length - 1 ?
+                        matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
+                        matcher._mintermClassifier.GetMintermID(c);
+                }
 
-                // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
-                return c == '\n' && pos == input.Length - 1 ?
-                    matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
-                    matcher._mintermClassifier.GetMintermID(c);
+                return -1;
             }
         }
 
-
         private interface IInitialStateHandler
         {
-            public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher,
-                ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
+            public static abstract bool TryFindNextStartingPosition<TInputReader>(
+                SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input,
+                ref CurrentState state, ref int pos)
                 where TInputReader : struct, IInputReader;
         }
 
@@ -1735,54 +1648,55 @@ public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRe
         private interface IAcceleratedStateHandler
         {
             public static abstract bool TryFindNextStartingPosition(
-                SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input,
-                ref int currentStateId, ref int pos, int initialStateId);
+                SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input,
+                byte[] lookup, ref int currentStateId, ref int pos, int initialStateId);
         }
 
         private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
-                byte[] lookup, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+            public static bool TryFindNextStartingPosition(
+                SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId)
             {
                 if (currentStateId != initialStateId)
                 {
                     return false;
                 }
 
-                if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+                if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
-                    return true;
+                    // No match exists
+                    currentStateId = matcher._deadStateId;
+                    pos = input.Length;
                 }
 
-                // No match exists
-                currentStateId = matcher._deadStateId;
-                pos = input.Length;
                 return true;
             }
         }
+
         private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
-                byte[] lookup,
-                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
+                ReadOnlySpan<char> input,
+                byte[] lookup, ref int currentStateId, ref int pos, int initialStateId)
             {
                 if (currentStateId != initialStateId)
+                {
                     return false;
+                }
 
                 if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
-                    currentStateId = matcher._dotstarredInitialStates[
-                        matcher._positionKinds[
-                            OptimizedSmallInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1]
-                    ].Id;
-                    return true;
+                    currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id;
+                }
+                else
+                {
+                    // No match exists
+                    currentStateId = matcher._deadStateId;
+                    pos = input.Length;
                 }
 
-                // No match exists
-                currentStateId = matcher._deadStateId;
-                pos = input.Length;
                 return true;
             }
         }
@@ -1790,26 +1704,18 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matche
         private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            public static bool TryFindNextStartingPosition(SymbolicRegexMatcher<TSet> matcher,
-                byte[] lookup,
-                ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
-            {
-                return false;
-            }
+            public static bool TryFindNextStartingPosition(
+                SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) =>
+                false;
         }
 
-        /// <summary>
-        /// No-op handler for when there are no initial state optimizations to apply.
-        /// </summary>
+        /// <summary>No-op handler for when there are no initial state optimizations to apply.</summary>
         private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatcher<TSet> matcher, ReadOnlySpan<char> input, ref CurrentState state, ref int pos)
-                where TInputReader : struct, IInputReader
-            {
-                // return true to indicate that the current position is a possible starting position
-                return true;
-            }
+                where TInputReader : struct, IInputReader =>
+                true; // the current position is a possible starting position
         }
 
         /// <summary>
@@ -1822,26 +1728,33 @@ public static bool TryFindNextStartingPosition<TInputReader>(SymbolicRegexMatche
                 where TInputReader : struct, IInputReader
             {
                 // Find the first position that matches with some likely character.
-                if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+                if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
-                    // No match exists
-                    return false;
+                    // Update the starting state based on where TryFindNextStartingPosition moved us to.
+                    // As with the initial starting state, if it's a dead end, no match exists.
+                    state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind<TInputReader>(input, pos - 1)]);
+                    return true;
                 }
 
-                // Update the starting state based on where TryFindNextStartingPosition moved us to.
-                // As with the initial starting state, if it's a dead end, no match exists.
-                state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind<TInputReader>(input, pos - 1)]);
-                return true;
+                // No match exists
+                return false;
             }
         }
 
-        /// <summary>
-        /// Interface for evaluating nullability of states.
-        /// </summary>
+        /// <summary>Interface for evaluating nullability of states.</summary>
         private interface INullabilityHandler
         {
-            public static abstract bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, StateFlags flags)
-                    where TStateHandler : struct, IStateHandler;
+            public static abstract bool IsNullableAt<TStateHandler>(
+                SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, StateFlags flags)
+                where TStateHandler : struct, IStateHandler;
+        }
+
+        /// <summary>This nullability handler interface can be used in DFAs for patterns that do not contain \Z.</summary>
+        private interface IOptimizedNullabilityHandler
+        {
+            public static abstract bool IsNullable(
+                SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId,
+                byte[] lookup, ReadOnlySpan<char> input, int pos);
         }
 
         /// <summary>
@@ -1865,9 +1778,37 @@ public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matche
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static bool IsNullableAt<TStateHandler>(SymbolicRegexMatcher<TSet> matcher, in CurrentState state, int positionId, StateFlags flags)
-                where TStateHandler : struct, IStateHandler
+                where TStateHandler : struct, IStateHandler =>
+                flags.IsNullable() ||
+                (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
+        }
+
+        private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
             {
-                return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
+                Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here");
+                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
+                return nullabilityArray[currStateId] > 0;
+            }
+        }
+
+        private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public static bool IsNullable(SymbolicRegexMatcher<TSet> matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan<char> input, int pos)
+            {
+                Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here");
+                Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
+
+                if (nullabilityArray[currStateId] > 0)
+                {
+                    char c = input[pos];
+                    return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0);
+                }
+
+                return false;
             }
         }
     }
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
index 4309054c354e6..5384810092b7f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
@@ -185,7 +185,7 @@ internal bool CanBeNullable
         public List<SymbolicRegexNode<TSet>> ToList(List<SymbolicRegexNode<TSet>>? list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat)
         {
             Debug.Assert(listKind is SymbolicRegexNodeKind.Concat or SymbolicRegexNodeKind.Alternate);
-            list ??= new List<SymbolicRegexNode<TSet>>();
+            list ??= [];
             AppendToList(this, list, listKind);
             return list;
 
@@ -394,10 +394,11 @@ SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor
                 SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
                 SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
                 SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
-            return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is
-                    SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
-                    SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor,
-                        kind is SymbolicRegexNodeKind.EndAnchorZ));
+            return Create(
+                builder, kind, null, null, -1, -1, default,
+                SymbolicRegexInfo.Anchor(
+                    isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor,
+                    isEndZAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ));
         }
 
         #endregion
@@ -541,8 +542,8 @@ internal static SymbolicRegexNode<TSet> CreateAlternate(SymbolicRegexBuilder<TSe
                 right.ToList(elems, listKind: SymbolicRegexNodeKind.Alternate);
 
                 // Eliminate any duplicate elements, keeping the leftmost element
-                HashSet<SymbolicRegexNode<TSet>> seenElems = new();
                 // Keep track of if any elements from the right side need to be eliminated
+                HashSet<SymbolicRegexNode<TSet>> seenElems = [];
                 bool rightChanged = false;
                 for (int i = 0; i < elems.Count; i++)
                 {
@@ -836,7 +837,7 @@ private static bool TryFoldAlternation(SymbolicRegexBuilder<TSet> builder, Symbo
             static bool TrySplitConcatSubsumption(SymbolicRegexBuilder<TSet> builder, SymbolicRegexNode<TSet> left, SymbolicRegexNode<TSet> right,
                 [NotNullWhen(true)] out SymbolicRegexNode<TSet>? prefix)
             {
-                List<SymbolicRegexNode<TSet>> prefixElements = new();
+                List<SymbolicRegexNode<TSet>> prefixElements = [];
                 SymbolicRegexNode<TSet> suffix = right;
                 while (suffix._kind == SymbolicRegexNodeKind.Concat)
                 {
@@ -1052,7 +1053,7 @@ public SymbolicRegexNode<TSet> AddFixedLengthMarkers(SymbolicRegexBuilder<TSet>
         /// <returns>the derivative</returns>
         internal List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder<TSet> builder, TSet elem, uint context)
         {
-            List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions = new();
+            List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> transitions = [];
             CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions);
             return transitions;
         }
@@ -1085,9 +1086,8 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexB
                 return this;
 
             // Cache result to avoid otherwise potential quadratic worst case behavior
-            SymbolicRegexNode<TSet>? prunedNode;
             (SymbolicRegexNode<TSet>, uint) key = (this, context);
-            if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode))
+            if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out SymbolicRegexNode<TSet>? prunedNode))
             {
                 return prunedNode;
             }
@@ -1254,9 +1254,8 @@ private SymbolicRegexNode<TSet> CreateDerivative(SymbolicRegexBuilder<TSet> buil
                 return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context);
             }
 
-            SymbolicRegexNode<TSet>? derivative;
             (SymbolicRegexNode<TSet>, TSet, uint) key = (this, elem, context);
-            if (builder._derivativeCache.TryGetValue(key, out derivative))
+            if (builder._derivativeCache.TryGetValue(key, out SymbolicRegexNode<TSet>? derivative))
             {
                 return derivative;
             }
@@ -1434,7 +1433,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder<TSet> builder, uint contex
                 return;
             }
 
-            currentEffects ??= new List<DerivativeEffect>();
+            currentEffects ??= [];
 
             // If we've reached a node with no effects, then output that with the effects that have been accumulated
             if (!_info.ContainsEffect)
@@ -1469,7 +1468,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder<TSet> builder, uint contex
                         _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
                         for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
                         {
-                            var (node, effects) = alternativesAndEffects[i];
+                            (SymbolicRegexNode<TSet> node, DerivativeEffect[] effects) = alternativesAndEffects[i];
                             alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects);
                         }
                         break;
@@ -1507,7 +1506,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder<TSet> builder, uint contex
                         _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
                         for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
                         {
-                            var (node, effects) = alternativesAndEffects[i];
+                            (SymbolicRegexNode<TSet> node, DerivativeEffect[] effects) = alternativesAndEffects[i];
                             alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects);
                         }
                         break;
@@ -1896,12 +1895,8 @@ private void CollectSets(SymbolicRegexBuilder<TSet> builder, HashSet<TSet> sets)
         }
 
         /// <summary>Compute and sort all the minterms from the sets in this regex.</summary>
-        public TSet[] ComputeMinterms(SymbolicRegexBuilder<TSet> builder)
-        {
-            HashSet<TSet> sets = GetSets(builder);
-            List<TSet> minterms = MintermGenerator<TSet>.GenerateMinterms(builder._solver, sets);
-            return minterms.ToArray();
-        }
+        public TSet[] ComputeMinterms(SymbolicRegexBuilder<TSet> builder) =>
+            MintermGenerator<TSet>.GenerateMinterms(builder._solver, GetSets(builder)).ToArray();
 
         /// <summary>
         /// Create the reverse of this regex
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index bf7d5a6501699..5d73a3e232e80 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -17,11 +17,8 @@ internal static class SymbolicRegexThresholds
         /// an NFA. As an NFA, we instead track all of the states we're in at any given point.
         /// </remarks>
         /// <remarks>
-        /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance
-        /// is currently approx. 50 MB.
-        /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially
-        /// significant search-time performance gains. Worst case memory consumption for the regex instance
-        /// can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))
+        /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance is currently ~50 MB.
+        /// Worst case memory consumption for the regex instance can be approximated to ~(NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))
         /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state.
         /// </remarks>
         internal const int NfaNodeCountThreshold = 125_000;
@@ -34,8 +31,8 @@ internal static class SymbolicRegexThresholds
         /// This default value may be overridden with the AppContext data
         /// whose name is given by  <see cref="SymbolicRegexSafeSizeThreshold_ConfigKeyName"/>.
         /// </remarks>
-        /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s,
-        /// although it could be safely raised higher at the expense of worst-case NFA performance
+        /// This limit is chosen due to worst case NFA speed constraints,
+        /// although it could be safely raised higher at the expense of worst-case NFA performance.
         /// </summary>
         internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000;
 
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 6ad2275f9584b..1f0e2932c6425 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -2653,36 +2653,25 @@ public static IEnumerable<object[]> MatchWordsInAnchoredRegexes_TestData()
                 yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } };
             }
         }
-#if NET
+
         [Fact]
         public async Task MatchNonBacktrackingOver255Minterms()
         {
-            // This is a test for the rare over 255 unique minterms case in MintermClassifier
-            StringBuilder pattern = new();
-            StringBuilder input = new();
-            for (int i = 128; i <= 400; i++)
-            {
-                char c = (char)i;
-                pattern.Append(c);
-                // adding an optional char as well just so it's not a string literal
-                pattern.Append(c);
-                pattern.Append('?');
-                // input is the pattern itself
-                input.Append(c);
-            }
+            // While valid on all engines, this test in particular is designed to exercise the rare case
+            // of more than 255 unique minterms case in the non-backtracking engine's minterm classifier.
 
-            string patternString = pattern.ToString();
-            string inputString = input.ToString();
+            IEnumerable<char> chars = Enumerable.Range(128, 400 - 128).Select(i => (char)i);
+            string patternString = string.Concat(chars.Select(c => $"{c}{c}?")); // adding an optional char as well just so it's not a string literal
+            string inputString = string.Concat(chars);
 
             foreach (RegexEngine engine in RegexHelpers.AvailableEngines)
             {
-                Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None);
+                Regex r = await RegexHelpers.GetRegexAsync(engine, patternString);
                 MatchCollection ms = r.Matches(inputString);
                 Assert.Equal(1, ms.Count);
                 Assert.Equal(0, ms[0].Index);
-                Assert.Equal(273, ms[0].Length);
+                Assert.Equal(272, ms[0].Length);
             }
         }
-#endif
     }
 }