From 34eba54a2f3480d2cd70f0f59d04690bb5f68e8a Mon Sep 17 00:00:00 2001 From: ieviev Date: Fri, 24 May 2024 12:37:43 +0300 Subject: [PATCH 01/63] Regex automata optimizations --- .../Symbolic/MatchingState.cs | 8 +- .../Symbolic/MintermClassifier.cs | 51 ++-- .../RegularExpressions/Symbolic/StateFlags.cs | 6 +- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 16 +- .../Symbolic/SymbolicRegexMatcher.cs | 261 ++++++++++++++++-- .../Symbolic/SymbolicRegexThresholds.cs | 7 +- 6 files changed, 282 insertions(+), 67 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index dce65a9996330..3c3029fb5a451 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -106,10 +106,9 @@ internal bool IsNullableFor(uint nextCharKind) /// /// Builds a with the relevant flags set. 
/// - /// a solver for /// whether this state is an initial state /// the flags for this matching state - internal StateFlags BuildStateFlags(ISolver solver, bool isInitial) + internal StateFlags BuildStateFlags(bool isInitial) { StateFlags info = 0; @@ -118,11 +117,6 @@ internal StateFlags BuildStateFlags(ISolver solver, bool isInitial) info |= StateFlags.IsInitialFlag; } - if (IsDeadend(solver)) - { - info |= StateFlags.IsDeadendFlag; - } - if (Node.CanBeNullable) { info |= StateFlags.CanBeNullableFlag; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index d00fcc0d62ff4..9fd9f85ac4f96 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -21,16 +21,15 @@ namespace System.Text.RegularExpressions.Symbolic internal sealed class MintermClassifier { /// An array used when there's a single minterm, in order to map every ASCII character to it trivially. - private static readonly int[] AllAsciiIsZeroMintermArray = new int[128]; + // private static readonly int[] AllAsciiIsZeroMintermArray = new int[128]; + private readonly int[] _lookup; - /// Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID. - private readonly int[] _ascii; - /// A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID. - /// - /// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further, - /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed. 
- /// - private readonly BDD _nonAscii; + // /// A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID. + // /// + // /// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further, + // /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed. + // /// + // private readonly BDD _nonAscii; /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. @@ -39,12 +38,13 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver) { Debug.Assert(minterms.Length > 0, "Requires at least"); + var lookup = new int[ushort.MaxValue]; if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0. - _ascii = AllAsciiIsZeroMintermArray; - _nonAscii = solver.ReplaceTrue(BDD.True, 0); + _lookup = lookup; + // _nonAscii = solver.ReplaceTrue(BDD.True, 0); return; } @@ -65,36 +65,21 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver) anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId); } - // Now that we have our mapping that supports any input character, we want to optimize for - // ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match - // time, we precompute a lookup table, where each ASCII character can be used to index into the - // array to determine the ID for its corresponding minterm. - var ascii = new int[128]; - for (int i = 0; i < ascii.Length; i++) + // TODO: this could be initialized more efficiently but it's + // a fundamentally different design choice that preallocates more memory. + // the minterm slice [1..] 
contains the ranges that should be really initialized + for (int i = 0; i < ushort.MaxValue; i++) { - ascii[i] = anyCharacterToMintermId.Find(i); + lookup[i] = anyCharacterToMintermId.Find(i); } - _ascii = ascii; - - // We can also further optimize the BDD in two ways: - // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first - // for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not - // affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD. - // 2. We can check if every character now maps to the same minterm ID (the same terminal in the - // multi-terminal BDD). This can be relatively common after (1) above is applied, as many - // patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character - // in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one. - BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii); - nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD; - _nonAscii = nonAsciiBDD; + _lookup = lookup; } /// Gets the ID of the minterm associated with the specified character. [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - int[] ascii = _ascii; - return (uint)c < (uint)ascii.Length ? 
ascii[c] : _nonAscii.Find(c); + return _lookup[c]; } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index 5a620f3771be6..cef4fdfc1ed13 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -6,7 +6,7 @@ namespace System.Text.RegularExpressions.Symbolic /// /// These flags provide context-independent information available for every state. They provide a fast way to evaluate /// conditions in the inner matching loops of . The matcher caches one of these - /// for every state, for which they are created by . + /// for every state, for which they are created by . /// In DFA mode the cached flags are used directly, while in NFA mode the /// handles aggregating the flags in the state set. 
/// @@ -14,10 +14,10 @@ namespace System.Text.RegularExpressions.Symbolic internal enum StateFlags : byte { IsInitialFlag = 1, - IsDeadendFlag = 2, IsNullableFlag = 4, CanBeNullableFlag = 8, SimulatesBacktrackingFlag = 16, + IsAcceleratedFlag = 32, } /// @@ -26,9 +26,9 @@ internal enum StateFlags : byte internal static class StateFlagsExtensions { internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0; - internal static bool IsDeadend(this StateFlags info) => (info & StateFlags.IsDeadendFlag) != 0; internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0; internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0; internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0; + internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != 0; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 278b69fe391fe..d20fe6ef13bca 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -40,6 +40,16 @@ internal sealed partial class SymbolicRegexMatcher /// private StateFlags[] _stateFlagsArray; + /// + /// Used to short-circuit nullability in the hot loop + /// + private bool[] _canBeNullableArray; + + /// + /// Used to short-circuit accelerated states in the hot loop + /// + private bool[] _canBeAcceleratedArray; + /// /// The transition function for DFA mode. 
/// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is @@ -178,9 +188,13 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node ArrayResizeAndVolatilePublish(ref _stateArray, newsize); ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize); + ArrayResizeAndVolatilePublish(ref _canBeNullableArray, newsize); + ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize); } _stateArray[state.Id] = state; - _stateFlagsArray[state.Id] = state.BuildStateFlags(Solver, isInitialState); + _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); + _canBeNullableArray[state.Id] = _stateFlagsArray[state.Id].CanBeNullable(); + _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated(); } return state; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 4394329f8eae2..a27bcf26cbd53 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -81,6 +81,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Data and routines for skipping ahead to the next place a match could potentially start. private readonly RegexFindOptimizations? _findOpts; + /// TODO: summarize + private readonly int _deadStateId; + /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. 
private readonly MatchingState[] _initialStates; @@ -172,6 +175,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // Initialization for fields in SymbolicRegexMatcher.Automata.cs _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; + _canBeNullableArray = new bool[InitialDfaStateCapacity]; + _canBeAcceleratedArray = new bool[InitialDfaStateCapacity]; _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm @@ -189,7 +194,28 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo if (findOptimizations.IsUseful && findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning) { - _findOpts = findOptimizations; + var setIsTooCommon = new Func((fds) => + { + return fds switch + { + // anything above 4 uint16 chars is generally slower than DFA + { Chars: not null } => fds.Chars.Length > 4, + { Range: not null } => false, + { Set: not null } => true, + _ => false + }; + }); + // a DFA is sometimes 10x-100x faster than the optimizations + // the "IsUseful" is harming the engine here + _findOpts = findOptimizations switch + { + { FindMode: FindNextStartingPositionMode.FixedDistanceString_LeftToRight } => findOptimizations, + { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } => + findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke)? null : findOptimizations, + { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } => setIsTooCommon( + findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations, + _ => findOptimizations // TODO: unsure which options are left here + }; } // Determine the number of initial states. 
If there's no anchor, only the default previous @@ -199,6 +225,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // The loops below and how character kinds are calculated assume that the "general" character kind is zero Debug.Assert(CharKind.General == 0); + // Assign dead state id + _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; + // Create the initial states for the original pattern. var initialStates = new MatchingState[statesCount]; for (uint charKind = 0; charKind < initialStates.Length; charKind++) @@ -448,8 +477,9 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - FindEndPositionDeltas(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + _findOpts is null ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. 
@@ -483,7 +513,75 @@ private int FindEndPosition + /// TODO: this is essentially a stripped down version when there's no good prefix optimizations + /// i don't trust the compiler to optimize this and it makes a + /// ~50% difference in performance with removing unnecessary checks alone + /// + private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length, RegexRunnerMode mode, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TNullabilityHandler : struct, INullabilityHandler + { + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. + int pos = posRef; + int endPos = endPosRef; + int endStateId = endStateIdRef; + int initialStatePos = initialStatePosRef; + int initialStatePosCandidate = initialStatePosCandidateRef; + try + { + // Loop through each character in the input, transitioning from state to state for each. + while (true) + { + if (state.DfaStateId == _deadStateId) + { + return true; + } + + int positionId = TInputReader.GetPositionId(this, input, pos); + + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. + if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + { + endPos = pos; + endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); + initialStatePos = initialStatePosCandidate; + + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } + } + + // If there is more input available try to transition with the next character. 
+ if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + { + return false; + } + + // We successfully transitioned, so update our current input index to match. + pos++; + } + } + finally + { + // Write back the local copies of the ref values. + posRef = pos; + endPosRef = endPos; + endStateIdRef = endStateId; + initialStatePosRef = initialStatePos; + initialStatePosCandidateRef = initialStatePosCandidate; + } + } + + /// + /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. @@ -500,7 +598,7 @@ private int FindEndPosition - private bool FindEndPositionDeltas(ReadOnlySpan input, int length, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader @@ -518,22 +616,101 @@ private bool FindEndPositionDeltas(this, input, ref state, ref pos)) { return true; } - initialStatePosCandidate = pos; } - // If the state is a dead end, such that we can't transition anywhere else, end the search. - if (flags.IsDeadend()) + int positionId = TInputReader.GetPositionId(this, input, pos); + + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. 
+ if (_canBeNullableArray[state.DfaStateId] && TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + { + endPos = pos; + endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); + initialStatePos = initialStatePosCandidate; + + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } + } + + // If there is more input available try to transition with the next character. + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + { + return false; + } + + // We successfully transitioned, so update our current input index to match. + pos++; + } + } + finally + { + // Write back the local copies of the ref values. + posRef = pos; + endPosRef = endPos; + endStateIdRef = endStateId; + initialStatePosRef = initialStatePos; + initialStatePosCandidateRef = initialStatePosCandidate; + } + } + + /// + /// TODO: this is the fallback NFA function + /// Workhorse inner loop for . Consumes the character by character, + /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, + /// lazily building out the graph as needed. + /// + /// + /// The supplies the actual transitioning logic, controlling whether processing is + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's + /// to be non-negative and its to be null; vice versa for + /// . + /// + /// + /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. + /// 0 if iteration completed because we reached an initial state. + /// A negative value if iteration completed because we ran out of input or we failed to transition. 
+ /// + private bool FindEndPositionDeltasNFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TFindOptimizationsHandler : struct, IInitialStateHandler + where TNullabilityHandler : struct, INullabilityHandler + { + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. + int pos = posRef; + int endPos = endPosRef; + int endStateId = endStateIdRef; + int initialStatePos = initialStatePosRef; + int initialStatePosCandidate = initialStatePosCandidateRef; + try + { + // Loop through each character in the input, transitioning from state to state for each. + while (true) + { + StateFlags flags = TStateHandler.GetStateFlags(this, in state); + + // Dead end here means the set is empty + if (state.NfaState!.NfaStateSet.Count == 0) { return true; } @@ -608,8 +785,8 @@ private int FindStartPosition(ReadOnlySpan(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : - FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart); + FindStartPositionDeltasNFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : + FindStartPositionDeltasDFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart); // If we found the starting position, we're done. if (done) @@ -635,7 +812,7 @@ private int FindStartPosition(ReadOnlySpan, for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. 
/// - private bool FindStartPositionDeltas(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + private bool FindStartPositionDeltasDFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler @@ -647,20 +824,66 @@ private bool FindStartPositionDeltas(this, in state, positionId, + TStateHandler.GetStateFlags(this, in state))) + { + lastStart = pos; + } + + // If we are past the start threshold or if the state is a dead end, bail; we should have already + // found a valid starting location. + if (pos <= startThreshold || state.DfaStateId == _deadStateId) + { + Debug.Assert(lastStart != -1); + return true; + } + + // Try to transition with the next character, the one before the current position. + if (!TStateHandler.TryTakeTransition(this, ref state, positionId)) + { + // Return false to indicate the search didn't finish. + return false; + } + + // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input. + pos--; + } + } + finally + { + // Write back the local copies of the ref values. + i = pos; + } + } + private bool FindStartPositionDeltasNFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TNullabilityHandler : struct, INullabilityHandler + { + // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. + int pos = i; + try + { + // Loop backwards through each character in the input, transitioning from state to state for each. 
+ while (true) + { int positionId = TInputReader.GetPositionId(this, input, pos - 1); // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, + TStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } // If we are past the start threshold or if the state is a dead end, bail; we should have already // found a valid starting location. - if (pos <= startThreshold || flags.IsDeadend()) + if (pos <= startThreshold || state.DfaStateId == _deadStateId) { Debug.Assert(lastStart != -1); return true; @@ -746,7 +969,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int coreStateId = GetCoreStateId(targetStateId); StateFlags flags = _stateFlagsArray[coreStateId]; - Debug.Assert(!flags.IsDeadend()); + Debug.Assert(coreStateId != _deadStateId); if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) { @@ -1150,12 +1373,6 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) { SparseIntMap stateSet = state.NfaState!.NfaStateSet; - if (stateSet.Count == 0) - { - // In NFA state sets dead ends are never included. Instead an empty set of states represents a dead end. - return StateFlags.IsDeadendFlag; - } - else { // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then // masking out the irrelevant ones. 
This works because IsNullable and CanBeNullable should be true if diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index 6057827e1d53f..c0118d52553ff 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -22,8 +22,13 @@ internal static class SymbolicRegexThresholds /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex. /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing /// to create a new node and the graph is already or newly beyond this threshold. + /// TODO: summarize this + /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s + /// with an entirely different search-time algorithmic complexity + /// 100_000 isn't a really a high memory cost either, + /// i'd even put 1_000_000 on the table but that might push it for general purpose use /// - internal const int NfaThreshold = 10_000; + internal const int NfaThreshold = 100_000; /// /// Default maximum estimated safe expansion size of a AST From 49607f42e3c4f24dfff979d04bb24b0f5ce04fc2 Mon Sep 17 00:00:00 2001 From: ieviev Date: Fri, 24 May 2024 15:13:52 +0300 Subject: [PATCH 02/63] off by one err --- .../Text/RegularExpressions/Symbolic/MintermClassifier.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 
9fd9f85ac4f96..3e97273e726f7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -38,7 +38,7 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver) { Debug.Assert(minterms.Length > 0, "Requires at least"); - var lookup = new int[ushort.MaxValue]; + var lookup = new int[ushort.MaxValue + 1]; if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). @@ -68,7 +68,7 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver) // TODO: this could be initialized more efficiently but it's // a fundamentally different design choice that preallocates more memory. // the minterm slice [1..] contains the ranges that should be really initialized - for (int i = 0; i < ushort.MaxValue; i++) + for (int i = 0; i <= ushort.MaxValue; i++) { lookup[i] = anyCharacterToMintermId.Find(i); } From 5ac29f36906e6afff5b3b1835d7155ff524e4bf6 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 26 May 2024 21:52:19 +0300 Subject: [PATCH 03/63] wip reversal optimizations --- .../src/System.Text.RegularExpressions.csproj | 2 + .../Symbolic/SymbolicRegexMatcher.Automata.cs | 72 +++++++++++++++++++ .../Symbolic/SymbolicRegexMatcher.cs | 30 +++++++- .../FunctionalTests/NonBacktrackingTests.cs | 22 ++++++ ...ystem.Text.RegularExpressions.Tests.csproj | 1 + 5 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 2042b930fdd2c..cb9c7e35ff972 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ 
b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,6 +5,8 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index d20fe6ef13bca..cf2cfa196398f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -50,6 +50,15 @@ internal sealed partial class SymbolicRegexMatcher /// private bool[] _canBeAcceleratedArray; +#if DEBUG + // private readonly Action _wout = st => + // { + // var a_cons = System.Reflection.Assembly.Load("System.Console"); + // var t_cons = a_cons.GetType("System.Console")!; + // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); + // wl!.Invoke(null, [st]); + // }; +#endif /// /// The transition function for DFA mode. /// Each state has a range of consecutive entries for each minterm ID. 
A range of size 2^L, where L is @@ -162,6 +171,69 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint return GetOrCreateState_NoLock(node, prevCharKind); } + /// + /// Optimized reversal state computation which takes skips the fixed length parts + /// + /// + /// + private (int, MatchingState) CreateOptimizedReversal(SymbolicRegexNode node) + { + var pos = 0; + var current = node; + var canLoop = true; + var incrPos = new Func<(int, SymbolicRegexNode), (bool, SymbolicRegexNode)>(value => + { + pos += value.Item1; + return (true, value.Item2); + }); + var decrLoop = new Func, (bool, SymbolicRegexNode)>(value => + { + var concat = value; + var loop = concat._left; + switch (loop!._left!.Kind) + { + case SymbolicRegexNodeKind.Singleton: + if (loop._lower == loop._upper) + { + pos += loop._lower; + return (true, concat._right!); + } + if (loop._lower > 0) + { + var delta = loop._upper - loop._lower; + var newLeft = _builder.CreateLoop(loop._left, loop.IsLazy, 0, delta); + var newNode = _builder.CreateConcat(newLeft, concat._right!); + pos += loop._lower; + return (true, newNode); + } + return (false, concat); + default: + return (false, concat); + } + }); + while (canLoop) + { +#if DEBUG + // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}"); +#endif + (bool loop, SymbolicRegexNode next) = current switch + { + {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} => + (true, current._right!), + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => + (true, current._right!), + {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => + incrPos((1, current._right!)), + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => + decrLoop(current), + _ => (false, current) + }; + canLoop = loop; + current = next; + } + return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0, 
false)); + } + /// /// Create a state with given node and previous character context. /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index a27bcf26cbd53..0f7bf2c01cd78 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -96,6 +96,8 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _reverseInitialStates; + private readonly (int, MatchingState) _optimizedReversalState; + /// Partition of the input space of sets. private readonly TSet[] _minterms; @@ -172,6 +174,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; + // Initialization for fields in SymbolicRegexMatcher.Automata.cs _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; @@ -262,6 +265,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo } _reverseInitialStates = reverseInitialStates; + // Create optimized reversal + _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder)); + // Maps a minterm ID to a character kind uint CalculateMintermIdKind(int mintermId) { @@ -776,9 +782,29 @@ private int FindStartPosition(ReadOnlySpan(input, i)]); - + CurrentState currentState; int lastStart = -1; // invalid sentinel value + // if possible use optimized reversal instead + if (_optimizedReversalState.Item1 > 0) + { + i -= _optimizedReversalState.Item1; + currentState = 
new CurrentState(_optimizedReversalState.Item2); + // anchor variant may need context to be computed if nullable + if (_pattern._info.ContainsSomeAnchor && _canBeNullableArray[currentState.DfaStateId]) + { + int positionId = TInputReader.GetPositionId(this, input, i); + if (TNullabilityHandler.IsNullableAt(this, + in currentState, positionId, + DfaStateHandler.GetStateFlags(this, in currentState))) + { + lastStart = i; + } + } + } + else + { + currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); + } // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. while (true) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs new file mode 100644 index 0000000000000..501df78391690 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs @@ -0,0 +1,22 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections; +using System.Collections.Generic; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + /// + /// TODO: Create tests here later + /// + public static partial class NonBacktrackingTests + { + + // [Fact] + // public static void Test() + // { + // } + + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj index dbab47f63d097..afdd6f1e51f24 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj @@ -19,6 +19,7 @@ + From e440dec535fba3c368f2e1e9830d7473ccb141b1 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 27 May 2024 00:28:55 +0300 Subject: [PATCH 04/63] removing unnecessary overhead --- .../src/System.Text.RegularExpressions.csproj | 3 +- .../Symbolic/MatchingState.cs | 3 +- .../Symbolic/MintermClassifier.cs | 1 + .../RegularExpressions/Symbolic/StateFlags.cs | 21 +++- .../Symbolic/SymbolicRegexInfo.cs | 21 ++-- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 1 + .../Symbolic/SymbolicRegexMatcher.cs | 100 +++++++++++++----- .../Symbolic/SymbolicRegexNode.cs | 3 +- 8 files changed, 112 insertions(+), 41 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index cb9c7e35ff972..a6f7119d2fd2f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,8 +5,7 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - - + diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 3c3029fb5a451..da7128b464da5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -99,8 +99,7 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m internal bool IsNullableFor(uint nextCharKind) { Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.IsNullableFor(context); + return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind)); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 3e97273e726f7..3810f35f69f84 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -81,5 +81,6 @@ public int GetMintermID(int c) { return _lookup[c]; } + public int[] Lookup => _lookup; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index cef4fdfc1ed13..990eb4807c7f1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -1,6 +1,8 @@ // Licensed to the .NET 
Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Runtime.CompilerServices; + namespace System.Text.RegularExpressions.Symbolic { /// @@ -13,6 +15,7 @@ namespace System.Text.RegularExpressions.Symbolic [Flags] internal enum StateFlags : byte { + None = 0, IsInitialFlag = 1, IsNullableFlag = 4, CanBeNullableFlag = 8, @@ -25,10 +28,18 @@ internal enum StateFlags : byte /// internal static class StateFlagsExtensions { - internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0; - internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0; - internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0; - internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0; - internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != 0; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None; } } diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index ff95195292bfa..750fbed4774bf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -17,6 +17,7 @@ namespace System.Text.RegularExpressions.Symbolic private const uint IsHighPriorityNullableMask = 64; private const uint ContainsEffectMask = 128; private const uint ContainsLineAnchorMask = 256; + private const uint ContainsEndZAnchorMask = 512; private readonly uint _info; @@ -26,7 +27,7 @@ private static SymbolicRegexInfo Create( bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, bool containsLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, - bool isHighPriorityNullable = false, bool containsEffect = false) + bool isHighPriorityNullable = false, bool containsEffect = false, bool containsEndZAnchor = false) { // Assert that the expected implications hold. For example, every node that contains a line anchor // must also be marked as containing some anchor. @@ -43,7 +44,8 @@ private static SymbolicRegexInfo Create( (startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) | (containsSomeAnchor ? ContainsSomeAnchorMask : 0) | (isHighPriorityNullable ? IsHighPriorityNullableMask : 0) | - (containsEffect ? ContainsEffectMask : 0)); + (containsEffect ? ContainsEffectMask : 0) | + (containsEndZAnchor ? 
ContainsEndZAnchorMask : 0)); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -53,7 +55,6 @@ private static SymbolicRegexInfo Create( public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0; - public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; @@ -63,6 +64,7 @@ private static SymbolicRegexInfo Create( public bool IsHighPriorityNullable => (_info & IsHighPriorityNullableMask) != 0; public bool ContainsEffect => (_info & ContainsEffectMask) != 0; + public bool ContainsEndZAnchor => (_info & ContainsEndZAnchorMask) != 0; /// /// Used for any node that acts as an epsilon, i.e., something that always matches the empty string. @@ -77,13 +79,15 @@ public static SymbolicRegexInfo Epsilon() => /// Used for all anchors. /// /// whether this anchor is a line anchor - public static SymbolicRegexInfo Anchor(bool isLineAnchor) => + /// whether this anchor is an end Z anchor + public static SymbolicRegexInfo Anchor(bool isLineAnchor, bool isEndZAnchor) => Create( canBeNullable: true, startsWithLineAnchor: isLineAnchor, containsLineAnchor: isLineAnchor, startsWithSomeAnchor: true, - containsSomeAnchor: true); + containsSomeAnchor: true, + containsEndZAnchor: isEndZAnchor); /// /// The alternation remains high priority nullable if the left alternative is so. 
@@ -99,7 +103,8 @@ public static SymbolicRegexInfo Alternate(SymbolicRegexInfo left_info, SymbolicR startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor, containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable, - containsEffect: left_info.ContainsEffect || right_info.ContainsEffect); + containsEffect: left_info.ContainsEffect || right_info.ContainsEffect, + containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor); /// /// Concatenation remains high priority nullable if both left and right are so. @@ -115,7 +120,9 @@ public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRege startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor), containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable, - containsEffect: left_info.ContainsEffect || right_info.ContainsEffect); + containsEffect: left_info.ContainsEffect || right_info.ContainsEffect, + containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor + ); /// /// Inherits anchor visibility from the loop body. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index cf2cfa196398f..06614dfd34f29 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions.Symbolic diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 0f7bf2c01cd78..f55a8fcd752c2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -84,6 +84,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// TODO: summarize private readonly int _deadStateId; + /// TODO: summarize + private readonly bool _containsAnyAnchor; + /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. 
private readonly MatchingState[] _initialStates; @@ -230,6 +233,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // Assign dead state id _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; + _containsAnyAnchor = _pattern._info.ContainsSomeAnchor; // Create the initial states for the original pattern. var initialStates = new MatchingState[statesCount]; @@ -378,7 +382,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. int matchStartLowBoundary, matchStartLengthMarker; - int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch + int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch { (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), @@ -418,7 +422,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { Debug.Assert(matchEnd >= startat - 1); matchStart = matchEnd < startat ? - startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch + startat : (_pattern._info.ContainsEndZAnchor, _pattern._info.ContainsSomeAnchor) switch { (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), @@ -484,7 +488,7 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - _findOpts is null ? 
FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + _findOpts is null ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -525,16 +529,16 @@ private int FindEndPosition - private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) - where TStateHandler : struct, IStateHandler - where TInputReader : struct, IInputReader - where TNullabilityHandler : struct, INullabilityHandler + private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length, RegexRunnerMode mode, + ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; int endPos = endPosRef; + int final = length - 1; + Span mtlookup = _mintermClassifier.Lookup.AsSpan(); int endStateId = endStateIdRef; + int currStateId = startStateId; int initialStatePos = initialStatePosRef; int initialStatePosCandidate = initialStatePosCandidateRef; try @@ -542,32 +546,49 @@ private bool FindEndPositionDeltasDFANoSkip(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (_canBeNullableArray[currStateId]) { - endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); - initialStatePos = initialStatePosCandidate; - - // A match is known to exist. If that's all we need to know, we're done. - if (mode == RegexRunnerMode.ExistenceRequired) + if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) { - return true; + endPos = pos; + endStateId = currStateId; + initialStatePos = initialStatePosCandidate; + + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } } } // If there is more input available try to transition with the next character. - if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + if (pos >= final || !DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)) { - return false; + if (pos < final) + { + return false; + } + // one off check for the final position + // this is just to move it out of the hot loop + if ((!_stateFlagsArray[currStateId].IsNullable() && + !_stateArray[currStateId]!.IsNullableFor( + GetPositionKind(positionId)))) + { + return false; + } + endPos = pos; + endStateId = currStateId; + initialStatePos = initialStatePosCandidate; + return mode == RegexRunnerMode.ExistenceRequired; } // We successfully transitioned, so update our current input index to match. 
@@ -576,6 +597,8 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan(this, - in currentState, positionId, + in currentState, TInputReader.GetPositionId(this, input, i), DfaStateHandler.GetStateFlags(this, in currentState))) { lastStart = i; @@ -1239,6 +1261,36 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur return false; } + /// Take the transition to the next DFA state without paying for the NFA structure + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, + int mintermId) + { + Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}."); + // Use the mintermId for the character being read to look up which state to transition to. + // If that state has already been materialized, move to it, and we're done. If that state + // hasn't been materialized, try to create it; if we can, move to it, and we're done. + int nextStateId = matcher._dfaDelta[matcher.DeltaOffset(state, mintermId)]; + if (nextStateId > 0) + { + // There was an existing DFA transition to some state. Move to it and + // return that we're still operating as a DFA and can keep going. + state = nextStateId; + return true; + } + + if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, + matcher.DeltaOffset(state, mintermId), + checkThreshold: true, out MatchingState? nextState)) + { + // We were able to create a new DFA transition to some state. Move to it and + // return that we're still operating as a DFA and can keep going. 
+ state = nextState.Id; + return true; + } + return false; + } + /// /// Gets context independent state information: /// - whether this is an initial state diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index a138c819be00f..4309054c354e6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -396,7 +396,8 @@ SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or - SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); + SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, + kind is SymbolicRegexNodeKind.EndAnchorZ)); } #endregion From 627fd9099ac66b10f28cd1fff15ba697827ab829 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 27 May 2024 00:58:10 +0300 Subject: [PATCH 05/63] handle final position correctly --- .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index f55a8fcd752c2..84f31bec2028a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -488,7 +488,7 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - _findOpts is null ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + _findOpts is null && pos < input.Length - 1 ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -561,7 +561,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length endPos = pos; endStateId = currStateId; initialStatePos = initialStatePosCandidate; - // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) { @@ -597,8 +596,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length } finally { - // handle final pos here - // Write back the local copies of the ref values. 
posRef = pos; endPosRef = endPos; From 7ae644012f8d058160a2e062ffbf273d59aa0b27 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 27 May 2024 17:07:02 +0300 Subject: [PATCH 06/63] edge case workarounds, tests should be ok again --- .../Symbolic/SymbolicRegexMatcher.cs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 84f31bec2028a..99d34fba06ace 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -87,6 +87,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// TODO: summarize private readonly bool _containsAnyAnchor; + /// TODO: summarize + private readonly bool _containsEndZAnchor; + /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _initialStates; @@ -233,7 +236,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // Assign dead state id _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; + + // Assign edge case info for quick lookup _containsAnyAnchor = _pattern._info.ContainsSomeAnchor; + _containsEndZAnchor = _pattern._info.ContainsEndZAnchor; // Create the initial states for the original pattern. 
var initialStates = new MatchingState[statesCount]; @@ -488,7 +494,8 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - _findOpts is null && pos < input.Length - 1 ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + // If there are no edge cases then use the quicker loop + (_findOpts is null && !_containsEndZAnchor && pos < input.Length - 1) ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -570,20 +577,28 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length } // If there is more input available try to transition with the next character. 
- if (pos >= final || !DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)) + // Note: the order here is important so the transition gets taken + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= final) { + // _wout($"end1: {_stateArray[currStateId]}"); if (pos < final) { return false; } + pos++; + // _wout($"end: {_stateArray[currStateId]}"); + // final transition + // DfaStateHandler.TryTakeDFATransition(this, ref currStateId, -1); + // // one off check for the final position // this is just to move it out of the hot loop if ((!_stateFlagsArray[currStateId].IsNullable() && !_stateArray[currStateId]!.IsNullableFor( - GetPositionKind(positionId)))) + GetPositionKind(-1)))) { return false; } + // the end position (-1) was nullable endPos = pos; endStateId = currStateId; initialStatePos = initialStatePosCandidate; From 383f3e5fb4c58bd455d622528c842fa5a1dc3ecd Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 27 May 2024 18:29:38 +0300 Subject: [PATCH 07/63] optimizing lookup initialization --- .../Symbolic/MintermClassifier.cs | 16 +++-- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 65 ++++++++++--------- .../Symbolic/SymbolicRegexThresholds.cs | 1 + .../tests/UnitTests/SymbolicRegexTests.cs | 14 ++++ 4 files changed, 62 insertions(+), 34 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 3810f35f69f84..799814ee7b9c2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -65,12 +65,18 @@ public MintermClassifier(BDD[] minterms, CharSetSolver solver) anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, 
charToTargetMintermId); } - // TODO: this could be initialized more efficiently but it's - // a fundamentally different design choice that preallocates more memory. - // the minterm slice [1..] contains the ranges that should be really initialized - for (int i = 0; i <= ushort.MaxValue; i++) + // assign minterm category for every char + // unused characters in minterm 0 get mapped to zero + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - lookup[i] = anyCharacterToMintermId.Find(i); + // precompute all assigned minterm categories + (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + foreach ((uint start, uint end) in mintermRanges) + { + // assign character ranges in bulk + Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); + slice.Fill(mintermId); + } } _lookup = lookup; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 06614dfd34f29..a6f86e09ffacf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -173,43 +173,50 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint } /// - /// Optimized reversal state computation which takes skips the fixed length parts + /// Optimized reversal state computation during construction which + /// skips the fixed length parts of reversal + /// e.g. 
for the pattern abc.*def + /// 1) the end is found at abc.*def| + /// 2) the reversal starts at abc.*| /// - /// - /// + /// reversed initial pattern + /// returns n of chars to skip and adjusted reversal start state private (int, MatchingState) CreateOptimizedReversal(SymbolicRegexNode node) { - var pos = 0; - var current = node; - var canLoop = true; - var incrPos = new Func<(int, SymbolicRegexNode), (bool, SymbolicRegexNode)>(value => + int pos = 0; + SymbolicRegexNode? current = node; + bool canLoop = true; + var addSingleton = new Func, (bool, SymbolicRegexNode)>(concatNode => { - pos += value.Item1; - return (true, value.Item2); + pos += 1; + // continue with next concat + return (true, concatNode._right!); }); - var decrLoop = new Func, (bool, SymbolicRegexNode)>(value => + var addFixedLengthLoop = new Func, (bool, SymbolicRegexNode)>(concatNode => { - var concat = value; - var loop = concat._left; - switch (loop!._left!.Kind) + SymbolicRegexNode? loopNode = concatNode._left; + if (loopNode is { _lower: <= 0 }) + { + return (false, concatNode); + } + switch (loopNode!._left!.Kind) { case SymbolicRegexNodeKind.Singleton: - if (loop._lower == loop._upper) - { - pos += loop._lower; - return (true, concat._right!); - } - if (loop._lower > 0) + + if (loopNode._lower == loopNode._upper) { - var delta = loop._upper - loop._lower; - var newLeft = _builder.CreateLoop(loop._left, loop.IsLazy, 0, delta); - var newNode = _builder.CreateConcat(newLeft, concat._right!); - pos += loop._lower; - return (true, newNode); + pos += loopNode._lower; + // the entire loop is fixed, continue + return (true, concatNode._right!); } - return (false, concat); + // subtract the fixed part of the loop + int loopRemainder = loopNode._upper - loopNode._lower; + SymbolicRegexNode newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); + SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); + pos += loopNode._lower; + return (true, 
newNode); default: - return (false, concat); + return (false, concatNode); } }); while (canLoop) @@ -224,15 +231,15 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => - incrPos((1, current._right!)), + addSingleton(current), {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => - decrLoop(current), + addFixedLengthLoop(current), _ => (false, current) }; canLoop = loop; current = next; } - return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0, false)); + return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index c0118d52553ff..b00f2631c3aa2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -33,6 +33,7 @@ internal static class SymbolicRegexThresholds /// /// Default maximum estimated safe expansion size of a AST /// after the AST has been anlayzed for safe handling. + /// TODO: this is perhaps too conservative, consider raising this /// /// If the AST exceeds this threshold then is thrown. 
/// This default value may be overridden with the AppContext data diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index 0e7046a04f36d..cbddba878edc2 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -253,5 +253,19 @@ public void SafeThresholdConfigTest(object? newThresholdData, int expectedThresh AppContext.SetData(SymbolicRegexThresholds.SymbolicRegexSafeSizeThreshold_ConfigKeyName, null); Assert.Equal(expectedThreshold, k); } + + [Fact] + public static void OptimizedReversalTests() + { + var charSetSolver = new CharSetSolver(); + var bddBuilder = new SymbolicRegexBuilder(charSetSolver, charSetSolver); + var converter = new RegexNodeConverter(bddBuilder, null); + const RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture; + RegexNode tree = RegexParser.Parse("abc.*def", options, CultureInfo.CurrentCulture).Root; + SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); + // todo: import the matcher here or use something else? 
+ // var matcher = SymbolicRegexMatcher.Create(bddBuilder, rootNode, 0, null, TimeSpan.MaxValue); + + } } } From 5a2636c9248d9bf115056c3cca255083e688afad Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 28 May 2024 04:08:23 +0300 Subject: [PATCH 08/63] more dfa overhead removed --- .../Symbolic/BitVectorSolver.cs | 4 +- .../Symbolic/MatchingState.cs | 17 ++++ .../Symbolic/MintermClassifier.cs | 34 +------- .../Symbolic/RegexNodeConverter.cs | 84 +++++++++++++++++++ .../Symbolic/SymbolicRegexMatcher.cs | 48 ++++++----- .../Symbolic/SymbolicRegexRunnerFactory.cs | 5 +- .../Symbolic/UInt64Solver.cs | 4 +- 7 files changed, 140 insertions(+), 56 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs index 09db2948d717b..b30527871e2bb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs @@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver internal readonly MintermClassifier _classifier; private readonly BitVector[] _mintermVectors; - public BitVectorSolver(BDD[] minterms, CharSetSolver solver) + public BitVectorSolver(BDD[] minterms) { _minterms = minterms; - _classifier = new MintermClassifier(minterms, solver); + _classifier = new MintermClassifier(minterms); var singleBitVectors = new BitVector[minterms.Length]; for (int i = 0; i < singleBitVectors.Length; i++) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index da7128b464da5..9624e0fd143bd 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -14,7 +14,18 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) { Node = node; PrevCharKind = prevCharKind; + // this is significantly cheaper to initialize once + // than to pay for it on every call + if (Node.CanBeNullable) + { + _nullabilityLookup = new bool[5]; + for (uint nk = 0; nk <= 4; nk++) + { + _nullabilityLookup[nk] = IsNullableForInit(nk); + } + } } + private readonly bool[]? _nullabilityLookup; /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -97,6 +108,12 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) + { + return (_nullabilityLookup is not null && _nullabilityLookup[nextCharKind]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsNullableForInit(uint nextCharKind) { Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 799814ee7b9c2..1132b3881efc4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -20,51 +20,23 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - /// An array used when there's a single minterm, in order to map every ASCII character to 
it trivially. - // private static readonly int[] AllAsciiIsZeroMintermArray = new int[128]; + /// An array used to map characters to minterms private readonly int[] _lookup; - // /// A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID. - // /// - // /// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further, - // /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed. - // /// - // private readonly BDD _nonAscii; - /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. - /// The character set solver to use. - public MintermClassifier(BDD[] minterms, CharSetSolver solver) + public MintermClassifier(BDD[] minterms) { Debug.Assert(minterms.Length > 0, "Requires at least"); - var lookup = new int[ushort.MaxValue + 1]; + int[] lookup = new int[ushort.MaxValue + 1]; if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). - // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0. _lookup = lookup; - // _nonAscii = solver.ReplaceTrue(BDD.True, 0); return; } - // Create a multi-terminal BDD for mapping any character to its associated minterm. - BDD anyCharacterToMintermId = BDD.False; - for (int i = 0; i < minterms.Length; i++) - { - // Each supplied minterm BDD decides whether a given character maps to it or not. - // We need to combine all of those into a multi-terminal BDD that decides which - // minterm a character maps to. To do that, we take each minterm BDD and replace - // its True result with the ID of the minterm, such that a character that would - // have returned True for that BDD now returns the minterm ID. 
- BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i); - - // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning - // is valid because every character belongs to exactly one minterm and thus will - // only map to an ID instead of False in exactly one of the input BDDs. - anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId); - } - // assign minterm category for every char // unused characters in minterm 0 get mapped to zero for (int mintermId = 1; mintermId < minterms.Length; mintermId++) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 9194ca00c971c..9e6c25f41a3d4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -527,5 +527,89 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code) } } } + + + /// + /// attempt to remove anchors when possible since it reduces overhead + /// more rewrites could be tried but it's important to preserve PCRE semantics + /// + /// + /// + /// + internal static SymbolicRegexNode ApplyRootRewrites(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode) + { + // only consider removing anchors, otherwise bail + if (!rootNode._info.ContainsSomeAnchor) return rootNode; + + // Func _wout = st => + // { + // var a_cons = System.Reflection.Assembly.Load("System.Console"); + // var t_cons = a_cons.GetType("System.Console")!; + // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); + // wl!.Invoke(null, [st]); + // return true; + // }; + + SymbolicRegexNode ApplyRewrites(SymbolicRegexNode node) + { + // Guard against stack overflow due to deep recursion + if 
(!StackHelper.TryEnsureSufficientExecutionStack()) + { + return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node)); + } + + var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver); + + switch (node._kind) + { + case SymbolicRegexNodeKind.Concat: + // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}"); + switch (node._left!._kind) + { + case SymbolicRegexNodeKind.CaptureStart: + return builder.CreateConcat(node._left, ApplyRewrites(node._right!)); + case SymbolicRegexNodeKind.BoundaryAnchor: + return node._right! switch + { + // \b\w{1,}.. -> \w{1,} + // anchor to the left can be removed + { + _kind: SymbolicRegexNodeKind.Concat, _left: + { + _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue + + } wordLoop + } + when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!), + _ => node + }; + case SymbolicRegexNodeKind.Loop: + var loopnode = node._left!; + // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case + bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1; + bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl); + return node._right! 
switch + { + // anchor to the right can be removed + { + _kind: SymbolicRegexNodeKind.Concat, + _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor, + _right._kind: SymbolicRegexNodeKind.CaptureEnd + } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)), + _ => node + }; + } + return node; + + + default: + return node; + } + } + + SymbolicRegexNode rewritten = ApplyRewrites(rootNode); + // _wout(rewritten.ToString()); + return rewritten; + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 99d34fba06ace..d2e333b45f0f8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -205,6 +205,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo { var setIsTooCommon = new Func((fds) => { + // _wout($"s{fds.Set}"); + // _wout($"c{fds.Chars.AsSpan()}"); return fds switch { // anything above 4 uint16 chars is generally slower than DFA @@ -225,6 +227,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations, _ => findOptimizations // TODO: unsure which options are left here }; + // _wout($"{findOptimizations.FindMode}"); + // _wout($"o{_findOpts}"); } // Determine the number of initial states. If there's no anchor, only the default previous @@ -488,14 +492,29 @@ private int FindEndPosition CharsPerTimeoutCheck ? 
pos + CharsPerTimeoutCheck : input.Length; + if (pos == input.Length && currentState.NfaState is null) + { + if ((!_stateFlagsArray[currentState.DfaStateId].IsNullable() && + !_stateArray[currentState.DfaStateId]!.IsNullableFor( + GetPositionKind(-1)))) + { + break; + } + // the end position (-1) was nullable + endPos = pos; + endStateId = currentState.DfaStateId; + break; + } + bool done = currentState.NfaState is not null ? FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : // If there are no edge cases then use the quicker loop - (_findOpts is null && !_containsEndZAnchor && pos < input.Length - 1) ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : + _findOpts is null && !_containsEndZAnchor ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -526,6 +545,8 @@ private int FindEndPosition 0 ? 
GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } @@ -536,18 +557,15 @@ private int FindEndPosition - private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. int pos = posRef; int endPos = endPosRef; - int final = length - 1; Span mtlookup = _mintermClassifier.Lookup.AsSpan(); int endStateId = endStateIdRef; int currStateId = startStateId; - int initialStatePos = initialStatePosRef; - int initialStatePosCandidate = initialStatePosCandidateRef; try { // Loop through each character in the input, transitioning from state to state for each. @@ -557,17 +575,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length { return true; } - int positionId = mtlookup[input[pos]]; + // int positionId = mtlookup[input[pos]]; // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. if (_canBeNullableArray[currStateId]) { - if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) + if (_stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) { endPos = pos; endStateId = currStateId; - initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) { @@ -578,18 +595,13 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // If there is more input available try to transition with the next character. 
// Note: the order here is important so the transition gets taken - if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= final) + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]]) || pos >= lengthMinus1) { - // _wout($"end1: {_stateArray[currStateId]}"); - if (pos < final) + pos++; + if (pos < input.Length) { return false; } - pos++; - // _wout($"end: {_stateArray[currStateId]}"); - // final transition - // DfaStateHandler.TryTakeDFATransition(this, ref currStateId, -1); - // // one off check for the final position // this is just to move it out of the hot loop if ((!_stateFlagsArray[currStateId].IsNullable() && @@ -601,7 +613,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // the end position (-1) was nullable endPos = pos; endStateId = currStateId; - initialStatePos = initialStatePosCandidate; return mode == RegexRunnerMode.ExistenceRequired; } @@ -615,8 +626,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length posRef = pos; endPosRef = endPos; endStateIdRef = endStateId; - initialStatePosRef = initialStatePos; - initialStatePosCandidateRef = initialStatePosCandidate; + initialStatePosRef = endStateId > 0 ? 
initialStatePosCandidateRef : initialStatePosRef; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index fea9518b79b51..ecd746ed6de87 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -21,6 +21,7 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping); SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root); + rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode); // Determine if the root node is supported for safe handling int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(); @@ -40,8 +41,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim BDD[] minterms = rootNode.ComputeMinterms(bddBuilder); _matcher = minterms.Length > 64 ? - SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) : - SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout); + SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms), matchTimeout) : + SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms), matchTimeout); } /// Creates a object. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs index 7664d6d03aa4a..c65c00fd23413 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs @@ -12,12 +12,12 @@ internal sealed class UInt64Solver : ISolver private readonly BDD[] _minterms; internal readonly MintermClassifier _classifier; - public UInt64Solver(BDD[] minterms, CharSetSolver solver) + public UInt64Solver(BDD[] minterms) { Debug.Assert(minterms.Length <= 64); _minterms = minterms; - _classifier = new MintermClassifier(minterms, solver); + _classifier = new MintermClassifier(minterms); Full = minterms.Length == 64 ? ulong.MaxValue : ulong.MaxValue >> (64 - minterms.Length); } From 57e5b8d80c45ffc52b8d04351e6ee256d9081037 Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 28 May 2024 05:00:27 +0300 Subject: [PATCH 09/63] removed potential rewrite --- .../Symbolic/RegexNodeConverter.cs | 165 +++++++++--------- .../Symbolic/SymbolicRegexMatcher.cs | 2 +- .../Symbolic/SymbolicRegexRunnerFactory.cs | 2 +- 3 files changed, 85 insertions(+), 84 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 9e6c25f41a3d4..88fc386b6956e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -529,87 +529,88 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code) } - /// - /// attempt to remove 
anchors when possible since it reduces overhead - /// more rewrites could be tried but it's important to preserve PCRE semantics - /// - /// - /// - /// - internal static SymbolicRegexNode ApplyRootRewrites(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode) - { - // only consider removing anchors, otherwise bail - if (!rootNode._info.ContainsSomeAnchor) return rootNode; - - // Func _wout = st => - // { - // var a_cons = System.Reflection.Assembly.Load("System.Console"); - // var t_cons = a_cons.GetType("System.Console")!; - // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); - // wl!.Invoke(null, [st]); - // return true; - // }; - - SymbolicRegexNode ApplyRewrites(SymbolicRegexNode node) - { - // Guard against stack overflow due to deep recursion - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node)); - } - - var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver); - - switch (node._kind) - { - case SymbolicRegexNodeKind.Concat: - // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}"); - switch (node._left!._kind) - { - case SymbolicRegexNodeKind.CaptureStart: - return builder.CreateConcat(node._left, ApplyRewrites(node._right!)); - case SymbolicRegexNodeKind.BoundaryAnchor: - return node._right! switch - { - // \b\w{1,}.. 
-> \w{1,} - // anchor to the left can be removed - { - _kind: SymbolicRegexNodeKind.Concat, _left: - { - _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue - - } wordLoop - } - when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!), - _ => node - }; - case SymbolicRegexNodeKind.Loop: - var loopnode = node._left!; - // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case - bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1; - bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl); - return node._right! switch - { - // anchor to the right can be removed - { - _kind: SymbolicRegexNodeKind.Concat, - _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor, - _right._kind: SymbolicRegexNodeKind.CaptureEnd - } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)), - _ => node - }; - } - return node; - - - default: - return node; - } - } - - SymbolicRegexNode rewritten = ApplyRewrites(rootNode); - // _wout(rewritten.ToString()); - return rewritten; - } + // /// + // /// attempt to remove anchors when possible since it reduces overhead + // /// more rewrites could be tried but it's important to preserve PCRE semantics + // /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner + // /// + // /// + // /// + // /// + // internal static SymbolicRegexNode ApplyRootRewrites(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode) + // { + // // only consider removing anchors, otherwise bail + // if (!rootNode._info.ContainsSomeAnchor) return rootNode; + + // // Func _wout = st => + // // { + // // var a_cons = System.Reflection.Assembly.Load("System.Console"); + // // var t_cons = a_cons.GetType("System.Console")!; + // // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); + // // wl!.Invoke(null, 
[st]); + // // return true; + // // }; + + // SymbolicRegexNode ApplyRewrites(SymbolicRegexNode node) + // { + // // Guard against stack overflow due to deep recursion + // if (!StackHelper.TryEnsureSufficientExecutionStack()) + // { + // return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node)); + // } + + // var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver); + + // switch (node._kind) + // { + // case SymbolicRegexNodeKind.Concat: + // // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}"); + // switch (node._left!._kind) + // { + // case SymbolicRegexNodeKind.CaptureStart: + // return builder.CreateConcat(node._left, ApplyRewrites(node._right!)); + // case SymbolicRegexNodeKind.BoundaryAnchor: + // return node._right! switch + // { + // // \b\w{1,}.. -> \w{1,} + // // anchor to the left can be removed + // { + // _kind: SymbolicRegexNodeKind.Concat, _left: + // { + // _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue + + // } wordLoop + // } + // when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!), + // _ => node + // }; + // case SymbolicRegexNodeKind.Loop: + // var loopnode = node._left!; + // // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case + // bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1; + // bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl); + // return node._right! 
switch + // { + // // anchor to the right can be removed + // { + // _kind: SymbolicRegexNodeKind.Concat, + // _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor, + // _right._kind: SymbolicRegexNodeKind.CaptureEnd + // } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)), + // _ => node + // }; + // } + // return node; + + + // default: + // return node; + // } + // } + + // SymbolicRegexNode rewritten = ApplyRewrites(rootNode); + // // _wout(rewritten.ToString()); + // return rewritten; + // } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index d2e333b45f0f8..303fa200e1819 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -581,7 +581,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // we found a potential end state. 
if (_canBeNullableArray[currStateId]) { - if (_stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) + if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) { endPos = pos; endStateId = currStateId; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index ecd746ed6de87..c046531f8a295 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -21,7 +21,7 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping); SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root); - rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode); + // rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode); // Determine if the root node is supported for safe handling int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(); From 4d275dbf512ac2f83630d443458da9f1936a0153 Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 28 May 2024 16:12:20 +0300 Subject: [PATCH 10/63] low memory variant --- .../Symbolic/MintermClassifier.cs | 30 +++++++++++++++++-- .../Symbolic/SymbolicRegexMatcher.cs | 8 +++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 1132b3881efc4..eceef93abfe1b 
100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -20,8 +20,10 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { + private static readonly int[] s_emptyLookup = new int[ushort.MaxValue + 1]; /// An array used to map characters to minterms private readonly int[] _lookup; + private readonly bool _isAsciiOnly; /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. @@ -29,16 +31,30 @@ public MintermClassifier(BDD[] minterms) { Debug.Assert(minterms.Length > 0, "Requires at least"); - int[] lookup = new int[ushort.MaxValue + 1]; + if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). - _lookup = lookup; + _lookup = s_emptyLookup; return; } + // low memory variant could create an ascii-only array + // cheaper to iterate twice than allocate an array and potentially not use it + _isAsciiOnly = true; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + { + (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + if (mintermRanges[^1].Item2 >= 128) + { + _isAsciiOnly = false; + } + } + + // assign minterm category for every char // unused characters in minterm 0 get mapped to zero + int[] lookup = new int[_isAsciiOnly ? 
128 : 65536]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // precompute all assigned minterm categories @@ -57,8 +73,16 @@ public MintermClassifier(BDD[] minterms) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { + if (_isAsciiOnly && (c >= 128)) + { + return 0; + } + // high performance variant would use a span directly + // but this is not possible in low memory constraints return _lookup[c]; } - public int[] Lookup => _lookup; + + // [MethodImpl(MethodImplOptions.AggressiveInlining)] + // public Span LookupSpan() => _lookup.AsSpan(); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 303fa200e1819..e08bd315819f1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -563,7 +563,8 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. int pos = posRef; int endPos = endPosRef; - Span mtlookup = _mintermClassifier.Lookup.AsSpan(); + // can only be used with full array + // Span mtlookup = _mintermClassifier.Lookup.AsSpan(); int endStateId = endStateIdRef; int currStateId = startStateId; try @@ -576,12 +577,13 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length return true; } // int positionId = mtlookup[input[pos]]; + int positionId = _mintermClassifier.GetMintermID(input[pos]); // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. 
if (_canBeNullableArray[currStateId]) { - if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) + if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) { endPos = pos; endStateId = currStateId; @@ -595,7 +597,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // If there is more input available try to transition with the next character. // Note: the order here is important so the transition gets taken - if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]]) || pos >= lengthMinus1) + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= lengthMinus1) { pos++; if (pos < input.Length) From c35ed7e9be5756f4d02aa29b8df0ef137c5b10f0 Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 28 May 2024 21:12:46 +0300 Subject: [PATCH 11/63] some kind of compromise between speed and memory --- .../Symbolic/MintermClassifier.cs | 73 ++++++++--- .../Symbolic/SymbolicRegexMatcher.cs | 121 ++++++++++++++++-- 2 files changed, 164 insertions(+), 30 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index eceef93abfe1b..a2ea8dcaeb904 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -20,11 +20,19 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - private static readonly int[] s_emptyLookup = new int[ushort.MaxValue + 1]; + private static readonly byte[] s_emptyLookup = new byte[ushort.MaxValue + 1]; /// An array used to map 
characters to minterms - private readonly int[] _lookup; + private readonly byte[]? _lookup; + + /// Conserve memory if pattern is ascii-only private readonly bool _isAsciiOnly; + /// + /// fallback lookup if over 255 minterms + /// this is almost never used + /// + private readonly int[]? _intLookup; + /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. public MintermClassifier(BDD[] minterms) @@ -39,8 +47,9 @@ public MintermClassifier(BDD[] minterms) return; } - // low memory variant could create an ascii-only array - // cheaper to iterate twice than allocate an array and potentially not use it + // low memory variant is to create an ascii-only array + // this adds indirection to the hot loop which costs performance + // and only exists because the wasm tests fail with OOM _isAsciiOnly = true; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { @@ -51,25 +60,44 @@ public MintermClassifier(BDD[] minterms) } } - // assign minterm category for every char // unused characters in minterm 0 get mapped to zero - int[] lookup = new int[_isAsciiOnly ? 
128 : 65536]; - for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + if (minterms.Length > 255) { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) + // over 255 unique sets also means it's never ascii only + int[] lookup = new int[ushort.MaxValue + 1]; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill(mintermId); + // precompute all assigned minterm categories + (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + foreach ((uint start, uint end) in mintermRanges) + { + // assign character ranges in bulk + Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); + slice.Fill(mintermId); + } } + _intLookup = lookup; + } + else + { + byte[] lookup = new byte[_isAsciiOnly ? 128 : ushort.MaxValue + 1]; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + { + // precompute all assigned minterm categories + (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + foreach ((uint start, uint end) in mintermRanges) + { + // assign character ranges in bulk + Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); + slice.Fill((byte)mintermId); + } + } + _lookup = lookup; } - _lookup = lookup; } - /// Gets the ID of the minterm associated with the specified character. + // /// Gets the ID of the minterm associated with the specified character. [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { @@ -79,10 +107,19 @@ public int GetMintermID(int c) } // high performance variant would use a span directly // but this is not possible in low memory constraints - return _lookup[c]; + // additional memory is saved by using a byte + return _intLookup is null ? 
_lookup![c] : _intLookup[c]; } - // [MethodImpl(MethodImplOptions.AggressiveInlining)] - // public Span LookupSpan() => _lookup.AsSpan(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsAsciiOnly() => _isAsciiOnly; + + /// + /// Can be null if there is over 255 minterms + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public byte[]? ByteLookup() => _lookup; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index e08bd315819f1..298012032e9b5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -511,11 +511,31 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - // If there are no edge cases then use the quicker loop - _findOpts is null && !_containsEndZAnchor ? FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + bool done; + if (currentState.NfaState is not null) + // nfa fallback check + done = FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, + ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null) + { + done = _mintermClassifier.IsAsciiOnly() + ? 
FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1, + mode, ref pos, + currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, + ref initialStatePosCandidate) + // if there are no edge cases then use the quicker loop + : FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, + currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, + ref initialStatePosCandidate); + } + else + { + // dfa loop with potential skipping + done = FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, + ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -551,6 +571,85 @@ private int FindEndPosition + /// Ascii-only variant of the hot loop to conserve memory + /// + private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + { + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. + int pos = posRef; + int endPos = endPosRef; + // can only be used with full array initialized and <= 255 minterms + byte[] mtlookup = _mintermClassifier.ByteLookup()!; + int endStateId = endStateIdRef; + int currStateId = startStateId; + try + { + // Loop through each character in the input, transitioning from state to state for each. + while (true) + { + if (currStateId == _deadStateId) + { + return true; + } + + int c = input[pos]; + int positionId = c >= 128 ? 0 : mtlookup[c]; + + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. 
+ if (_canBeNullableArray[currStateId]) + { + if (_stateFlagsArray[currStateId].IsNullable() + || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) + { + endPos = pos; + endStateId = currStateId; + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } + } + } + + // If there is more input available try to transition with the next character. + // Note: the order here is important so the transition gets taken + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1) + { + pos++; + if (pos < input.Length) + { + return false; + } + // one off check for the final position + // this is just to move it out of the hot loop + if ((!_stateFlagsArray[currStateId].IsNullable() && + !_stateArray[currStateId]!.IsNullableFor( + GetPositionKind(-1)))) + { + return false; + } + // the end position (-1) was nullable + endPos = pos; + endStateId = currStateId; + return mode == RegexRunnerMode.ExistenceRequired; + } + + // We successfully transitioned, so update our current input index to match. + pos++; + } + } + finally + { + // Write back the local copies of the ref values. + posRef = pos; + endPosRef = endPos; + endStateIdRef = endStateId; + initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef; + } + } /// /// TODO: this is essentially a stripped down version when there's no good prefix optimizations @@ -563,8 +662,8 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; int endPos = endPosRef; - // can only be used with full array - // Span mtlookup = _mintermClassifier.Lookup.AsSpan(); + // can only be used with full array initialized and <= 255 minterms + byte[] mtlookup = _mintermClassifier.ByteLookup()!; int endStateId = endStateIdRef; int currStateId = startStateId; try @@ -576,14 +675,12 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length { return true; } - // int positionId = mtlookup[input[pos]]; - int positionId = _mintermClassifier.GetMintermID(input[pos]); - // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. if (_canBeNullableArray[currStateId]) { - if (_stateFlagsArray[currStateId].IsNullable() || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) + if (_stateFlagsArray[currStateId].IsNullable() + || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) { endPos = pos; endStateId = currStateId; @@ -597,7 +694,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // If there is more input available try to transition with the next character. 
// Note: the order here is important so the transition gets taken - if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId) || pos >= lengthMinus1) + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1) { pos++; if (pos < input.Length) From 868e02d955f200d892f2360c99be1cc700471719 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 29 May 2024 19:43:20 +0300 Subject: [PATCH 12/63] cheaper nullability checks --- .../Symbolic/MatchingState.cs | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 9624e0fd143bd..4094dfec19ccb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -18,14 +18,23 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) // than to pay for it on every call if (Node.CanBeNullable) { - _nullabilityLookup = new bool[5]; - for (uint nk = 0; nk <= 4; nk++) + for (uint ck = 0; ck < CharKind.CharKindCount; ck++) { - _nullabilityLookup[nk] = IsNullableForInit(nk); + _nullabilityLookup |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0); } } } - private readonly bool[]? 
_nullabilityLookup; + /// + /// todo: change this to flags later + /// nullability for each context encoded in a bit + /// 0 means node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter + /// + private readonly byte _nullabilityLookup; // redundant but added for clarity /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -106,12 +115,18 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); } + /// + /// Bit encoded nullability check for the hot loop + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - return (_nullabilityLookup is not null && _nullabilityLookup[nextCharKind]); + return (nextCharKind & _nullabilityLookup) > 0; } + /// + /// Full nullability check for initialization + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableForInit(uint nextCharKind) { From 14afd188bab372ccf2ba9f219213962806642206 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 29 May 2024 19:51:22 +0300 Subject: [PATCH 13/63] nullability encoding --- .../Text/RegularExpressions/Symbolic/MatchingState.cs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 4094dfec19ccb..13c8f3900bb02 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -26,6 +26,9 @@ internal 
MatchingState(SymbolicRegexNode node, uint prevCharKind) } /// /// todo: change this to flags later + /// i think the clr assigns an entire class field for this + /// so this should be placed in an array as well + /// -- /// nullability for each context encoded in a bit /// 0 means node cannot be nullable /// 00001 -> nullable for General @@ -34,7 +37,7 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) /// 01000 -> nullable for NewLineS /// 10000 -> nullable for WordLetter /// - private readonly byte _nullabilityLookup; // redundant but added for clarity + private readonly byte _nullabilityLookup; /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -121,7 +124,7 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - return (nextCharKind & _nullabilityLookup) > 0; + return ((nextCharKind + 1) & _nullabilityLookup) > 0; } /// From 5f5ab5523d880086382eeb0edc5e7d42399f8fa1 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 29 May 2024 21:44:17 +0300 Subject: [PATCH 14/63] nullability cached as bytes --- .../src/System.Text.RegularExpressions.csproj | 2 +- .../Symbolic/MatchingState.cs | 62 ++++++++++++------- .../Symbolic/MintermClassifier.cs | 17 ++--- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 24 ++++++- .../Symbolic/SymbolicRegexMatcher.cs | 52 ++++++++-------- 5 files changed, 97 insertions(+), 60 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index a6f7119d2fd2f..6fbc17722a774 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,7 +5,7 @@ true 
$(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 13c8f3900bb02..941a5f76ea27e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -14,30 +14,17 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) { Node = node; PrevCharKind = prevCharKind; - // this is significantly cheaper to initialize once - // than to pay for it on every call - if (Node.CanBeNullable) - { - for (uint ck = 0; ck < CharKind.CharKindCount; ck++) - { - _nullabilityLookup |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0); - } - } + NullabilityInfo = BuildNullabilityInfo(); } + /// - /// todo: change this to flags later - /// i think the clr assigns an entire class field for this - /// so this should be placed in an array as well - /// -- - /// nullability for each context encoded in a bit - /// 0 means node cannot be nullable - /// 00001 -> nullable for General - /// 00010 -> nullable for BeginningEnd - /// 00100 -> nullable for NewLine - /// 01000 -> nullable for NewLineS - /// 10000 -> nullable for WordLetter + /// TODO: The CLR assigns an entire field for this byte which is a waste, + /// and the much more preferred way to use this is in _nullabilityArray in the matcher + /// but the current design relies on interfaces/flags and + /// using the MatchingState directly so this byte is a quick solution to cheapen + /// it there by ~30% as well without having to breaking it all to pieces /// - private readonly byte _nullabilityLookup; + internal readonly int NullabilityInfo; /// The regular expression that labels this state and gives it its semantics. 
internal SymbolicRegexNode Node { get; } @@ -119,12 +106,15 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m } /// - /// Bit encoded nullability check for the hot loop + /// TODO: This method should really never be used and + /// is only used to speed up the existing architecture. + /// Use + /// whereever possible /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - return ((nextCharKind + 1) & _nullabilityLookup) > 0; + return ((1 << (int)nextCharKind) & NullabilityInfo) != 0; } /// @@ -168,6 +158,32 @@ internal StateFlags BuildStateFlags(bool isInitial) return info; } + /// + /// nullability for each context is encoded in a bit + /// 0 means node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter + /// todo: change to flags later + /// + /// + internal byte BuildNullabilityInfo() + { + byte nullabilityInfo = 0; + // this is significantly cheaper to initialize once + // than to pay for it on every call + if (Node.CanBeNullable) + { + for (uint ck = 0; ck < CharKind.CharKindCount; ck++) + { + nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0); + } + } + return nullabilityInfo; + } + public override bool Equals(object? 
obj) => obj is MatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index a2ea8dcaeb904..1f852d16cee2c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -47,8 +47,8 @@ public MintermClassifier(BDD[] minterms) return; } - // low memory variant is to create an ascii-only array - // this adds indirection to the hot loop which costs performance + // low memory compromise is to create an ascii-only array + // int mintermId = c >= 128 ? 0 : mtlookup[c]; // and only exists because the wasm tests fail with OOM _isAsciiOnly = true; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) @@ -97,7 +97,7 @@ public MintermClassifier(BDD[] minterms) } } - // /// Gets the ID of the minterm associated with the specified character. + /// Gets the ID of the minterm associated with the specified character. [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { @@ -105,18 +105,21 @@ public int GetMintermID(int c) { return 0; } - // high performance variant would use a span directly - // but this is not possible in low memory constraints + // high performance variant would use a span directly. // additional memory is saved by using a byte return _intLookup is null ? 
_lookup![c] : _intLookup[c]; } - + /// + /// Whether to use the low memory ascii-only hot loop or the full loop + /// + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool IsAsciiOnly() => _isAsciiOnly; /// - /// Can be null if there is over 255 minterms + /// Quick mapping from char to minterm, + /// can be null if there is over 255 minterms /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index a6f86e09ffacf..60e8712298bfb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -42,9 +42,17 @@ internal sealed partial class SymbolicRegexMatcher private StateFlags[] _stateFlagsArray; /// + /// important: the pattern must not contain endZ for this to be valid. 
/// Used to short-circuit nullability in the hot loop + /// nullability for each context is encoded in a bit + /// 0 means node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter /// - private bool[] _canBeNullableArray; + private byte[] _nullabilityArray; /// /// Used to short-circuit accelerated states in the hot loop @@ -127,6 +135,16 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId; + /// + /// Pre-computed hot-loop version of nullability check + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsNullableWithContext(int stateId, int mintermId) => + ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0; + /// Returns the span from that may contain transitions for the given state private Span GetDeltasFor(MatchingState state) { @@ -268,12 +286,12 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node ArrayResizeAndVolatilePublish(ref _stateArray, newsize); ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize); - ArrayResizeAndVolatilePublish(ref _canBeNullableArray, newsize); + ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize); ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize); } _stateArray[state.Id] = state; _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); - _canBeNullableArray[state.Id] = _stateFlagsArray[state.Id].CanBeNullable(); + _nullabilityArray[state.Id] = state.BuildNullabilityInfo(); _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated(); } diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 298012032e9b5..343256513bdcd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -81,13 +81,13 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Data and routines for skipping ahead to the next place a match could potentially start. private readonly RegexFindOptimizations? _findOpts; - /// TODO: summarize + /// Dead end state to quickly return NoMatch private readonly int _deadStateId; - /// TODO: summarize + /// Whether the pattern contains any anchor private readonly bool _containsAnyAnchor; - /// TODO: summarize + /// Whether the pattern contains the EndZ anchor which makes most optimizations invalid private readonly bool _containsEndZAnchor; /// The initial states for the original pattern, keyed off of the previous character kind. @@ -184,7 +184,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // Initialization for fields in SymbolicRegexMatcher.Automata.cs _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; - _canBeNullableArray = new bool[InitialDfaStateCapacity]; + _nullabilityArray = new byte[InitialDfaStateCapacity]; _canBeAcceleratedArray = new bool[InitialDfaStateCapacity]; _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; @@ -519,13 +519,15 @@ private int FindEndPosition - /// Ascii-only variant of the hot loop to conserve memory + /// Ascii-only variant of the hot loop to conserve memory. + /// Only major difference is the minterm lookup: + /// `int positionId = c >= 128 ? 
0 : mtlookup[c]`; /// private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) @@ -597,12 +601,10 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l int c = input[pos]; int positionId = c >= 128 ? 0 : mtlookup[c]; - // If the state is nullable for the next character, meaning it accepts the empty string, - // we found a potential end state. - if (_canBeNullableArray[currStateId]) + // If the state is nullable for the next character we found a potential end state. + // note: the double array lookup is important here, storing a local variable is expensive + if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, positionId)) { - if (_stateFlagsArray[currStateId].IsNullable() - || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(positionId))) { endPos = pos; endStateId = currStateId; @@ -675,12 +677,10 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length { return true; } - // If the state is nullable for the next character, meaning it accepts the empty string, - // we found a potential end state. - if (_canBeNullableArray[currStateId]) + // If the state is nullable for the next character, we found a potential end state. 
+ // note: the double array lookup is important here, storing a local variable is expensive + if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]])) { - if (_stateFlagsArray[currStateId].IsNullable() - || _stateArray[currStateId]!.IsNullableFor(GetPositionKind(mtlookup[input[pos]]))) { endPos = pos; endStateId = currStateId; @@ -770,10 +770,6 @@ private bool FindEndPositionDeltasDFA(this, input, ref state, ref pos)) @@ -787,7 +783,7 @@ private bool FindEndPositionDeltasDFA(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); @@ -859,6 +855,9 @@ private bool FindEndPositionDeltasNFA(ReadOnlySpan 0) { if (TNullabilityHandler.IsNullableAt(this, in currentState, TInputReader.GetPositionId(this, input, i), @@ -996,7 +995,7 @@ private bool FindStartPositionDeltasDFA(this, in state, positionId, + if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { lastStart = pos; @@ -1341,7 +1340,8 @@ private interface IStateHandler public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind); + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, + uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0; /// Gets the preferred 
DFA state for nullability. In DFA mode this is just the state itself. [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1708,7 +1708,7 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche where TStateHandler : struct, IStateHandler { Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); - return flags.IsNullable(); + return matcher.IsNullableWithContext(state.DfaStateId, positionId); } } From dd121de495123fcfb7cc1424532873287d2169fe Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 30 May 2024 22:16:52 +0300 Subject: [PATCH 15/63] reverting some changes --- .../Symbolic/MatchingState.cs | 10 ++-------- .../Symbolic/MintermClassifier.cs | 7 +++---- .../Symbolic/RegexNodeConverter.cs | 2 +- .../Symbolic/SymbolicRegexMatcher.cs | 18 ++++++++++++++++-- .../Symbolic/SymbolicRegexThresholds.cs | 4 ++-- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 941a5f76ea27e..5d9c66dbb07e0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -18,11 +18,7 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) } /// - /// TODO: The CLR assigns an entire field for this byte which is a waste, - /// and the much more preferred way to use this is in _nullabilityArray in the matcher - /// but the current design relies on interfaces/flags and - /// using the MatchingState directly so this byte is a quick solution to cheapen - /// it there by ~30% as well without having to breaking it all to pieces + /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor /// internal readonly int 
NullabilityInfo; @@ -106,8 +102,7 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m } /// - /// TODO: This method should really never be used and - /// is only used to speed up the existing architecture. + /// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned /// Use /// whereever possible /// @@ -166,7 +161,6 @@ internal StateFlags BuildStateFlags(bool isInitial) /// 00100 -> nullable for NewLine /// 01000 -> nullable for NewLineS /// 10000 -> nullable for WordLetter - /// todo: change to flags later /// /// internal byte BuildNullabilityInfo() diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 1f852d16cee2c..83cd14daf30d2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -47,9 +47,8 @@ public MintermClassifier(BDD[] minterms) return; } - // low memory compromise is to create an ascii-only array + // ascii-only array to save memory // int mintermId = c >= 128 ? 
0 : mtlookup[c]; - // and only exists because the wasm tests fail with OOM _isAsciiOnly = true; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { @@ -60,8 +59,8 @@ public MintermClassifier(BDD[] minterms) } } - // assign minterm category for every char - // unused characters in minterm 0 get mapped to zero + // i have never seen a regex use over 80 minterms not to speak of 255, + // but it's there as a fallback mechanism if (minterms.Length > 255) { // over 255 unique sets also means it's never ascii only diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 88fc386b6956e..31f01271d558b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -532,7 +532,7 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code) // /// // /// attempt to remove anchors when possible since it reduces overhead // /// more rewrites could be tried but it's important to preserve PCRE semantics - // /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner + // /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner // /// // /// // /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 343256513bdcd..258c0ab72ae46 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -81,7 
+81,7 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Data and routines for skipping ahead to the next place a match could potentially start. private readonly RegexFindOptimizations? _findOpts; - /// Dead end state to quickly return NoMatch + /// Dead end state to quickly return NoMatch, this could potentially be a constant private readonly int _deadStateId; /// Whether the pattern contains any anchor @@ -102,6 +102,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _reverseInitialStates; + /// + /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state. + /// private readonly (int, MatchingState) _optimizedReversalState; /// Partition of the input space of sets. @@ -328,6 +331,8 @@ uint CalculateMintermIdKind(int mintermId) /// internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize); + /// TODO: when you're calling a function millions of times per second even this add 1 does cost something + /// this should be ideally remapped /// Look up what is the character kind given a position ID [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1]; @@ -351,6 +356,7 @@ internal TSet GetMintermFromId(int mintermId) return minterms[mintermId]; } + /// TODO: this if-else branch could be called once. it's currently causing overhead on every single step [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetCharKind(ReadOnlySpan input, int i) where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ? 
@@ -657,6 +663,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l /// TODO: this is essentially a stripped down version when there's no good prefix optimizations /// i don't trust the compiler to optimize this and it makes a /// ~50% difference in performance with removing unnecessary checks alone + /// /// private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) @@ -668,9 +675,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length byte[] mtlookup = _mintermClassifier.ByteLookup()!; int endStateId = endStateIdRef; int currStateId = startStateId; + // ldfld only once + // int deadStateId = _deadStateId; try { // Loop through each character in the input, transitioning from state to state for each. + // The goal is to make this loop as fast as it can possible be, + // every single piece of overhead should be removed here + // there should be not a single callvirt instruction in the loop + // ldfld only if necessary (e.g. 
a reference changes) + // no memory writes unless necessary while (true) { if (currStateId == _deadStateId) @@ -783,7 +797,7 @@ private bool FindEndPositionDeltasDFA 0 && TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index b00f2631c3aa2..d455f26da1dcf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -26,14 +26,14 @@ internal static class SymbolicRegexThresholds /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s /// with an entirely different search-time algorithmic complexity /// 100_000 isn't a really a high memory cost either, - /// i'd even put 1_000_000 on the table but that might push it for general purpose use + /// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has /// internal const int NfaThreshold = 100_000; /// /// Default maximum estimated safe expansion size of a AST /// after the AST has been anlayzed for safe handling. - /// TODO: this is perhaps too conservative, consider raising this + /// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too /// /// If the AST exceeds this threshold then is thrown. 
/// This default value may be overridden with the AppContext data From 723c5b61e9814a352c39e50edc3b8842da428b2f Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 6 Jun 2024 02:15:14 +0300 Subject: [PATCH 16/63] testing nfa fallback --- .../Symbolic/SymbolicRegexMatcher.cs | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 258c0ab72ae46..15c1ae489c6aa 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -503,11 +503,12 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); } - // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. 
if (done || pos >= input.Length) @@ -626,15 +627,15 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l // Note: the order here is important so the transition gets taken if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1) { - pos++; - if (pos < input.Length) + if (pos + 1 < input.Length) { return false; } + pos++; // one off check for the final position // this is just to move it out of the hot loop - if ((!_stateFlagsArray[currStateId].IsNullable() && - !_stateArray[currStateId]!.IsNullableFor( + if (!(_stateFlagsArray[currStateId].IsNullable() || + _stateArray[currStateId]!.IsNullableFor( GetPositionKind(-1)))) { return false; @@ -710,15 +711,15 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // Note: the order here is important so the transition gets taken if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1) { - pos++; - if (pos < input.Length) + if (pos + 1 < input.Length) { return false; } + pos++; // one off check for the final position // this is just to move it out of the hot loop - if ((!_stateFlagsArray[currStateId].IsNullable() && - !_stateArray[currStateId]!.IsNullableFor( + if (!(_stateFlagsArray[currStateId].IsNullable() || + _stateArray[currStateId]!.IsNullableFor( GetPositionKind(-1)))) { return false; @@ -1722,7 +1723,7 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche where TStateHandler : struct, IStateHandler { Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); - return matcher.IsNullableWithContext(state.DfaStateId, positionId); + return flags.IsNullable(); } } @@ -1736,6 +1737,8 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche where TStateHandler : struct, IStateHandler { return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + // cannot be used in NFA mode + // 
return matcher.IsNullableWithContext(state.DfaStateId, positionId); } } } From 6bf4095ad83a9e3c2be7f55c1c55744b56211bc9 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 17 Jun 2024 23:55:54 +0300 Subject: [PATCH 17/63] refactoring, work in progress --- .../src/System.Text.RegularExpressions.csproj | 6 +- .../Symbolic/MatchReversal.cs | 17 + .../Symbolic/MatchReversalKind.cs | 14 + .../Symbolic/MatchingState.cs | 2 - .../Symbolic/MintermClassifier.cs | 31 +- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 37 ++- .../Symbolic/SymbolicRegexMatcher.cs | 312 +++++++++++------- 7 files changed, 269 insertions(+), 150 deletions(-) create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 6fbc17722a774..0d952017013c0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,7 +5,9 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - + IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649; + + $(NoWarn);CS1574 @@ -94,6 +96,8 @@ + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs new file mode 100644 index 0000000000000..b7be92195ee58 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -0,0 +1,17 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Text.RegularExpressions.Symbolic; + +internal sealed class MatchReversal where TSet : IComparable, IEquatable +{ + public MatchReversal(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null) + { + Kind = kind; + FixedLength = fixedLength; + AdjustedStartState = adjustedStartState; + } + internal MatchReversalKind Kind { get; } + internal int FixedLength { get; } + internal MatchingState? AdjustedStartState { get; } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs new file mode 100644 index 0000000000000..d498e4dd7eb99 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +namespace System.Text.RegularExpressions.Symbolic; + +internal enum MatchReversalKind +{ + /// The most generic option, run the regex backwards to find beginning of match + MatchStart, + /// Part of the reversal is fixed length and can be skipped + PartialFixedLength, + /// The entire pattern is fixed length, reversal not necessary + FixedLength +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 5d9c66dbb07e0..5bd2baf668d3d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -166,8 +166,6 @@ internal StateFlags BuildStateFlags(bool isInitial) internal byte BuildNullabilityInfo() { byte nullabilityInfo = 0; - // this is significantly cheaper to initialize once - // than to pay for it on every call if (Node.CanBeNullable) { for (uint ck = 0; ck < CharKind.CharKindCount; ck++) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 83cd14daf30d2..d3a0933c18433 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -49,6 +49,7 @@ public MintermClassifier(BDD[] minterms) // ascii-only array to save memory // int mintermId = c >= 128 ? 
0 : mtlookup[c]; + // _isAsciiOnly = true; _isAsciiOnly = true; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { @@ -63,20 +64,24 @@ public MintermClassifier(BDD[] minterms) // but it's there as a fallback mechanism if (minterms.Length > 255) { + // WIP: temporary exception to see if any tests in the pipeline reach this + // if nothing reaches this perhaps it'd be easier to just throw an exception + // during construction + throw new Exception($"reached over 255 minterms, count {minterms}"); // over 255 unique sets also means it's never ascii only - int[] lookup = new int[ushort.MaxValue + 1]; - for (int mintermId = 1; mintermId < minterms.Length; mintermId++) - { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) - { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill(mintermId); - } - } - _intLookup = lookup; + // int[] lookup = new int[ushort.MaxValue + 1]; + // for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + // { + // // precompute all assigned minterm categories + // (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + // foreach ((uint start, uint end) in mintermRanges) + // { + // // assign character ranges in bulk + // Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); + // slice.Fill(mintermId); + // } + // } + // _intLookup = lookup; } else { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 60e8712298bfb..1ef89b006fef0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -60,13 +60,13 @@ internal sealed partial class SymbolicRegexMatcher private bool[] _canBeAcceleratedArray; #if DEBUG - // private readonly Action _wout = st => - // { - // var a_cons = System.Reflection.Assembly.Load("System.Console"); - // var t_cons = a_cons.GetType("System.Console")!; - // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); - // wl!.Invoke(null, [st]); - // }; + private readonly Action _wout = st => + { + var a_cons = System.Reflection.Assembly.Load("System.Console"); + var t_cons = a_cons.GetType("System.Console")!; + var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); + wl!.Invoke(null, [st]); + }; #endif /// /// The transition function for DFA mode. @@ -199,7 +199,7 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint /// /// reversed initial pattern /// returns n of chars to skip and adjusted reversal start state - private (int, MatchingState) CreateOptimizedReversal(SymbolicRegexNode node) + private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node) { int pos = 0; SymbolicRegexNode? 
current = node; @@ -240,10 +240,15 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint while (canLoop) { #if DEBUG - // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}"); + // if (current._left is null) + // _wout($"NULL {current._kind}"); + // else + // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}"); #endif (bool loop, SymbolicRegexNode next) = current switch { + // if this is reached then entire match is fixed length + { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} => (true, current._right!), {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => @@ -257,7 +262,17 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint canLoop = loop; current = next; } - return (pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); + + MatchReversal reversal = + (pos, current) switch + { + { pos: > 0 } when current == _builder.Epsilon => new MatchReversal(MatchReversalKind.FixedLength, pos), + { pos: > 0 } => new MatchReversal(MatchReversalKind.PartialFixedLength, pos, + GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)), + _ => new MatchReversal(MatchReversalKind.MatchStart, 0) + }; + + return reversal; } /// @@ -424,7 +439,7 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse MatchingState coreState = GetState(coreId); TSet minterm = GetMintermFromId(mintermId); uint nextCharKind = GetPositionKind(mintermId); - SymbolicRegexNode? targetNode = coreTargetId > 0 ? + SymbolicRegexNode targetNode = coreTargetId > 0 ? 
GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); List targetsList = new(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 15c1ae489c6aa..30fc1be98abdf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -105,7 +105,7 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state. /// - private readonly (int, MatchingState) _optimizedReversalState; + private readonly MatchReversal _optimizedReversalState; /// Partition of the input space of sets. private readonly TSet[] _minterms; @@ -159,11 +159,11 @@ public static SymbolicRegexMatcher Create( // Convert the BDD-based AST to TSet-based AST SymbolicRegexNode rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver)); - return new SymbolicRegexMatcher(builder, rootNode, captureCount, findOptimizations, matchTimeout); + return new SymbolicRegexMatcher(bddBuilder, builder, rootNode, captureCount, findOptimizations, matchTimeout); } /// Constructs matcher for given symbolic regex. 
- private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) + private SymbolicRegexMatcher(SymbolicRegexBuilder bddBuilder, SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) { Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}"); @@ -200,23 +200,34 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); } + // Create optimized reversal + _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder)); + // Store the find optimizations that can be used to jump ahead to the next possible starting location. // If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's // handling for beginning anchors. 
if (findOptimizations.IsUseful && findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning) { + // this makes some assumptions about the frequency of occurrences + // some large sets like \p{Sm} are faster with infrequent matches but slower with frequent matches + // the easiest thing to do here is to leave it as-is, but this means some inputs can have large performance losses of 10x or more + var setIsTooCommon = new Func((fds) => { - // _wout($"s{fds.Set}"); - // _wout($"c{fds.Chars.AsSpan()}"); + // _wout($"rn{fds.Range is null}"); + // _wout($"cn{fds.Chars is null}"); + // _wout($"cc{fds.Chars!.Length}"); return fds switch { - // anything above 4 uint16 chars is generally slower than DFA - { Chars: not null } => fds.Chars.Length > 4, + { Chars: not null } => + // anything above 4 uint16 chars is generally slower than DFA + fds.Negated || + (fds.Chars.Length > 4 && + Array.Exists(fds.Chars, char.IsAsciiLetterLower)), { Range: not null } => false, - { Set: not null } => true, - _ => false + _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength, + // false }; }); // a DFA is sometimes 10x-100x faster than the optimizations @@ -230,7 +241,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo findOptimizations.FixedDistanceSets![0]) ? 
null : findOptimizations, _ => findOptimizations // TODO: unsure which options are left here }; + // _findOpts = findOptimizations; + // _findOpts = null; // _wout($"{findOptimizations.FindMode}"); + // _wout($"{findOptimizations.FixedDistanceSets![0]}"); // _wout($"o{_findOpts}"); } @@ -282,8 +296,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo } _reverseInitialStates = reverseInitialStates; - // Create optimized reversal - _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder)); // Maps a minterm ID to a character kind uint CalculateMintermIdKind(int mintermId) @@ -397,17 +409,18 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. - int matchStartLowBoundary, matchStartLengthMarker; + // int matchStartLowBoundary, matchStartLengthMarker; + int matchStartLowBoundary; int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch { - (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - 
(false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), + (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (false, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), + (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), }; // If there wasn't a match, we're done. @@ -430,21 +443,61 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from // that last b until it finds the 4th a: aaabbbc. 
int matchStart; - if (matchStartLengthMarker >= 0) + Debug.Assert(matchEnd >= startat - 1); + switch (_optimizedReversalState.Kind) { - matchStart = matchEnd - matchStartLengthMarker; - } - else - { - Debug.Assert(matchEnd >= startat - 1); - matchStart = matchEnd < startat ? - startat : (_pattern._info.ContainsEndZAnchor, _pattern._info.ContainsSomeAnchor) switch + case MatchReversalKind.FixedLength: + matchStart = (matchEnd - _optimizedReversalState.FixedLength); + break; + case MatchReversalKind.MatchStart: + case MatchReversalKind.PartialFixedLength: + int initialLastStart = -1; // invalid sentinel value + int i = matchEnd; + uint charKind2 = GetCharKind(input, matchEnd); + CurrentState reversalStartState; + + // _containsAnyAnchor + if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength) { - (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (false, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (false, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + i -= _optimizedReversalState.FixedLength; + reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!); + // reversal may already be nullable here in the case of anchors + if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0) + { + if (FullNullabilityHandler.IsNullableAt(this, + in reversalStartState, FullInputReader.GetPositionId(this, input, i), + DfaStateHandler.GetStateFlags(this, in reversalStartState))) + { + initialLastStart = i; + } + } + } + else + { + reversalStartState = new CurrentState(_reverseInitialStates[charKind2]); + } + uint charKind = GetCharKind(input, matchEnd); + var startState = new CurrentState(_reverseInitialStates[charKind]); + matchStart = matchEnd < startat + ? 
startat + : (_containsEndZAnchor, _containsAnyAnchor) switch + { + (true, true) => + FindStartPosition( + startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + (true, false) => + FindStartPosition( + startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + (false, true) => + FindStartPosition( + startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + (false, false) => + FindStartPosition( + startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), }; + break; + default: + throw new ArgumentOutOfRangeException(); } // Phase 3: @@ -471,12 +524,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// The time at which timeout occurs, if timeouts are being checked. /// The mode of execution based on the regex operation being performed. /// The last position the initial state of was visited before the end position was found. - /// Length of the match if there's a match; otherwise, -1. /// Per thread data reused between calls. /// /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. 
/// - private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, PerThreadData perThreadData) where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler @@ -487,7 +539,6 @@ private int FindEndPosition(input, pos - 1)]); int endPos = NoMatchExists; - int endStateId = -1; while (true) { @@ -503,48 +554,34 @@ private int FindEndPosition(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePos, ref initialStatePosCandidate); - else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null) + TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate); + // else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null) + else if (_findOpts is null && !_containsEndZAnchor) { done = - _mintermClassifier.IsAsciiOnly() - ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1, - mode, ref pos, - currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, - ref initialStatePosCandidate) + // _mintermClassifier.IsAsciiOnly() + // ? 
FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1, + // mode, ref pos, + // currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, + // ref initialStatePosCandidate) // if there are no edge cases then use the quicker loop - : - FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, - currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, + // : + FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, + currentState.DfaStateId, ref endPos, ref initialStatePos, ref initialStatePosCandidate); } else { // dfa loop with potential skipping done = FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + TNullabilityHandler>(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -570,13 +607,9 @@ private int FindEndPosition 0 ? GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; + + } return endPos; } @@ -586,14 +619,13 @@ private int FindEndPosition= 128 ? 0 : mtlookup[c]`; /// private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; int endPos = endPosRef; // can only be used with full array initialized and <= 255 minterms byte[] mtlookup = _mintermClassifier.ByteLookup()!; - int endStateId = endStateIdRef; int currStateId = startStateId; try { @@ -614,7 +646,6 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l { { endPos = pos; - endStateId = currStateId; // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) { @@ -642,7 +673,6 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l } // the end position (-1) was nullable endPos = pos; - endStateId = currStateId; return mode == RegexRunnerMode.ExistenceRequired; } @@ -655,8 +685,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l // Write back the local copies of the ref values. posRef = pos; endPosRef = endPos; - endStateIdRef = endStateId; - initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef; + initialStatePosRef = currStateId > 0 ? 
initialStatePosCandidateRef : initialStatePosRef; } } @@ -666,39 +695,61 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l /// ~50% difference in performance with removing unnecessary checks alone /// /// - private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TFindOptimizationsHandler : struct, IInitialStateHandler + where TNullabilityHandler : struct, INullabilityHandler { + // initial check for input end to get it out of the loop + if (posRef == input.Length) + { + if (!(_stateFlagsArray[startStateId].IsNullable() || + _stateArray[startStateId]!.IsNullableFor( + GetPositionKind(-1)))) + { + return true; + } + + // the end position (-1) was nullable + endPosRef = posRef; + return true; + } + + // Debug.Assert(posRef < input.Length, $"input end condition should be handled outside the loop"); // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. int pos = posRef; int endPos = endPosRef; // can only be used with full array initialized and <= 255 minterms byte[] mtlookup = _mintermClassifier.ByteLookup()!; - int endStateId = endStateIdRef; int currStateId = startStateId; // ldfld only once - // int deadStateId = _deadStateId; + int deadStateId = _deadStateId; try { // Loop through each character in the input, transitioning from state to state for each. 
- // The goal is to make this loop as fast as it can possible be, + // The goal is to make this loop as fast as it can possibly be, // every single piece of overhead should be removed here // there should be not a single callvirt instruction in the loop // ldfld only if necessary (e.g. a reference changes) // no memory writes unless necessary while (true) { - if (currStateId == _deadStateId) + if (currStateId == deadStateId) { return true; } + + // acceleratedstatehandler + // If the state is nullable for the next character, we found a potential end state. // note: the double array lookup is important here, storing a local variable is expensive if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]])) { { endPos = pos; - endStateId = currStateId; // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) { @@ -726,7 +777,6 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length } // the end position (-1) was nullable endPos = pos; - endStateId = currStateId; return mode == RegexRunnerMode.ExistenceRequired; } @@ -739,8 +789,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length // Write back the local copies of the ref values. posRef = pos; endPosRef = endPos; - endStateIdRef = endStateId; - initialStatePosRef = endStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef; + initialStatePosRef = currStateId > 0 ? initialStatePosCandidateRef : initialStatePosRef; } } @@ -764,7 +813,7 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int length /// A negative value if iteration completed because we ran out of input or we failed to transition. 
/// private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -773,18 +822,19 @@ private bool FindEndPositionDeltasDFA(this, input, ref state, ref pos)) @@ -798,10 +848,11 @@ private bool FindEndPositionDeltasDFA(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, + positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); + // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. 
@@ -812,7 +863,8 @@ private bool FindEndPositionDeltasDFA= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, + positionId)) { return false; } @@ -826,7 +878,7 @@ private bool FindEndPositionDeltasDFA private bool FindEndPositionDeltasNFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -860,7 +912,6 @@ private bool FindEndPositionDeltasNFA(this, in state, positionId, flags)) { endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. @@ -911,7 +961,6 @@ private bool FindEndPositionDeltasNFA. /// + /// State to start reversal from + /// Either valid match start location or -1 /// The input text. /// The ending position to walk backwards from. points one past the last character of the match. /// The initial starting location discovered in phase 1, a point we must not walk earlier than. /// Per thread data reused between calls. /// The found starting position for the match. 
- private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + private int FindStartPosition(CurrentState startState, int initialLastStart, ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { Debug.Assert(i >= 0, $"{nameof(i)} == {i}"); Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}"); Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}."); - - // Get the starting state for the reverse pattern. This depends on previous character (which, because we're - // going backwards, is character number i). - CurrentState currentState; - int lastStart = -1; // invalid sentinel value - // if possible use optimized reversal instead - if (_optimizedReversalState.Item1 > 0) - { - i -= _optimizedReversalState.Item1; - currentState = new CurrentState(_optimizedReversalState.Item2); - // anchor variant may need context to be computed if nullable - if (_containsAnyAnchor && _nullabilityArray[currentState.DfaStateId] > 0) - { - if (TNullabilityHandler.IsNullableAt(this, - in currentState, TInputReader.GetPositionId(this, input, i), - DfaStateHandler.GetStateFlags(this, in currentState))) - { - lastStart = i; - } - } - } - else - { - currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); - } + CurrentState currentState = startState; + int lastStart = initialLastStart; // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. 
while (true) @@ -1634,6 +1662,12 @@ private interface IInputReader public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); } + private readonly struct OptimizedAsciiInputReader : IInputReader + { + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => + matcher._mintermClassifier.GetMintermID(input[pos]); + } + /// This reader omits the special handling of \n for the \Z anchor. private readonly struct NoZAnchorInputReader : IInputReader { @@ -1658,15 +1692,47 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan } } + + private interface IInitialStateHandler + { + public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader; + } + /// /// Interface for optimizations to accelerate search from initial states. /// - private interface IInitialStateHandler + private interface IAcceleratedStateHandler { - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static abstract void TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) where TInputReader : struct, IInputReader; } + private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader + { + // Find the first position that matches with some likely character. 
+ if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + // No match exists + state = new CurrentState(matcher.GetState(matcher._deadStateId)); + pos = input.Length; + return; + } + + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + state = new CurrentState( + matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + return; + } + } + /// /// No-op handler for when there are no initial state optimizations to apply. /// From b10e600dfab71d795f884f00af6ac3f79127d04e Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 18 Jun 2024 19:46:40 +0300 Subject: [PATCH 18/63] refactoring to struct interfaces --- .../src/System.Text.RegularExpressions.csproj | 6 +- .../Symbolic/MintermClassifier.cs | 13 +- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 14 +- .../Symbolic/SymbolicRegexMatcher.cs | 459 ++++++++++++------ 4 files changed, 330 insertions(+), 162 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 0d952017013c0..b3fda5f2f4326 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,9 +5,9 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649; - - $(NoWarn);CS1574 + + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index d3a0933c18433..96cd72c4c0c95 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -27,11 +27,11 @@ internal sealed class MintermClassifier /// Conserve memory if pattern is ascii-only private readonly bool _isAsciiOnly; - /// - /// fallback lookup if over 255 minterms - /// this is almost never used - /// - private readonly int[]? _intLookup; + // /// + // /// fallback lookup if over 255 minterms + // /// this is almost never used + // /// + // private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. @@ -111,7 +111,8 @@ public int GetMintermID(int c) } // high performance variant would use a span directly. // additional memory is saved by using a byte - return _intLookup is null ? _lookup![c] : _intLookup[c]; + return _lookup![c]; + // return _intLookup is null ? 
_lookup![c] : _intLookup[c]; } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 1ef89b006fef0..eac516c43bcdd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -60,13 +60,13 @@ internal sealed partial class SymbolicRegexMatcher private bool[] _canBeAcceleratedArray; #if DEBUG - private readonly Action _wout = st => - { - var a_cons = System.Reflection.Assembly.Load("System.Console"); - var t_cons = a_cons.GetType("System.Console")!; - var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); - wl!.Invoke(null, [st]); - }; + // private readonly Action _wout = st => + // { + // var a_cons = System.Reflection.Assembly.Load("System.Console"); + // var t_cons = a_cons.GetType("System.Console")!; + // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); + // wl!.Invoke(null, [st]); + // }; #endif /// /// The transition function for DFA mode. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 30fc1be98abdf..3f0543772b52a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -163,7 +163,7 @@ public static SymbolicRegexMatcher Create( } /// Constructs matcher for given symbolic regex. 
- private SymbolicRegexMatcher(SymbolicRegexBuilder bddBuilder, SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) + private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) { Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}"); @@ -226,8 +226,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder bddBuilder, SymbolicRegex (fds.Chars.Length > 4 && Array.Exists(fds.Chars, char.IsAsciiLetterLower)), { Range: not null } => false, + // for fixed length strings just trust the optimizations _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength, - // false }; }); // a DFA is sometimes 10x-100x faster than the optimizations @@ -408,20 +408,53 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // It returns NoMatchExists (-2) when there is no match. // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after - // the c as the low boundary for the starting position. 
- // int matchStartLowBoundary, matchStartLengthMarker; - int matchStartLowBoundary; - int matchEnd = (_pattern._info.ContainsEndZAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch + // the c as the low boundary for the starting position.d + // int matchStartLowBoundary = startat; + int matchEnd; + if (!_containsEndZAnchor) { - (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (false, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, perThreadData), - }; + bool isAsciiOnly = _mintermClassifier.IsAsciiOnly(); + matchEnd = (isAsciiOnly, _findOpts is not null, _containsAnyAnchor) switch + { + (true, true, true) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true, false) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false, false) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false, true) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true, false) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, 
perThreadData), + (false, true, true) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false, true) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false, false) => + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + }; + } + else + { + // fallback for EndZ anchor + matchEnd = (_findOpts is not null) switch + { + true => + FindEndPositionFallback( + input, startat, timeoutOccursAt, mode, perThreadData), + false => + FindEndPositionFallback( + input, startat, timeoutOccursAt, mode, perThreadData), + }; + } // If there wasn't a match, we're done. if (matchEnd == NoMatchExists) @@ -453,10 +486,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i case MatchReversalKind.PartialFixedLength: int initialLastStart = -1; // invalid sentinel value int i = matchEnd; - uint charKind2 = GetCharKind(input, matchEnd); CurrentState reversalStartState; - - // _containsAnyAnchor if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength) { i -= _optimizedReversalState.FixedLength; @@ -474,26 +504,25 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - reversalStartState = new CurrentState(_reverseInitialStates[charKind2]); + reversalStartState = new CurrentState(_reverseInitialStates[ + GetCharKind(input, matchEnd)]); } - uint charKind = GetCharKind(input, matchEnd); - var startState = new CurrentState(_reverseInitialStates[charKind]); matchStart = matchEnd < startat ? 
startat : (_containsEndZAnchor, _containsAnyAnchor) switch { (true, true) => FindStartPosition( - startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + reversalStartState, initialLastStart, input, i, startat, perThreadData), (true, false) => FindStartPosition( - startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + reversalStartState, initialLastStart, input, i, startat, perThreadData), (false, true) => FindStartPosition( - startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + reversalStartState, initialLastStart, input, i, startat, perThreadData), (false, false) => FindStartPosition( - startState, initialLastStart, input, matchEnd, matchStartLowBoundary, perThreadData), + reversalStartState, initialLastStart, input, i, startat, perThreadData), }; break; default: @@ -511,34 +540,94 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - Registers endRegisters = _pattern._info.ContainsLineAnchor ? + Registers endRegisters = _containsAnyAnchor ? 
FindSubcaptures(input, matchStart, matchEnd, perThreadData) : FindSubcaptures(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } + private int FindEndPositionOptimized(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) + where TOptimizedInputReader : struct, IOptimizedInputReader + where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler + { + int initialStatePosCandidate = pos; + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); + int endPos = NoMatchExists; + + while (true) + { + const int CharsPerTimeoutCheck = 1_000; + // TODO: maybe this should be for NFA mode only + int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ? + pos + CharsPerTimeoutCheck : + input.Length; + + bool done; + if (currentState.NfaState is null) + done = + FindEndPositionDeltasDFAOptimized(input, innerLoopLength - 1, mode, ref pos, + currentState.DfaStateId, ref endPos, ref initialStatePosCandidate, + ref initialStatePosCandidate); + else + // nfa fallback check + // assume \Z and full nullability for nfa since it's already extremely rare to get here + done = + FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, + ref initialStatePosCandidate, ref initialStatePosCandidate); + + // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or + // there is no more input available, then the whole search is done. + if (done || pos >= input.Length) + { + break; + } + + // The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and + // need to switch over to NFA mode) or ran out of input in the inner loop. 
Check if the inner loop still had more + // input available. + if (pos < innerLoopLength) + { + // Because there was still more input available, a failure to transition in DFA mode must be the cause + // of the early exit. Upgrade to NFA mode. + NfaMatchingState nfaState = perThreadData.NfaState; + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); + currentState = new CurrentState(nfaState); + } + + // Check for a timeout before continuing. + if (_checkTimeout) + { + CheckTimeout(timeoutOccursAt); + } + } + return endPos; + } + /// Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call. /// The input text. /// The starting position in . /// The time at which timeout occurs, if timeouts are being checked. /// The mode of execution based on the regex operation being performed. - /// The last position the initial state of was visited before the end position was found. /// Per thread data reused between calls. /// /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, PerThreadData perThreadData) + private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { - initialStatePos = pos; int initialStatePosCandidate = pos; var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; + int endStateId = -1; while (true) { @@ -549,40 +638,14 @@ private int FindEndPosition CharsPerTimeoutCheck ? 
pos + CharsPerTimeoutCheck : input.Length; + bool done = currentState.NfaState is not null ? + FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate) : + FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); - - bool done; - if (currentState.NfaState is not null) - // nfa fallback check - done = FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate); - // else if (_findOpts is null && !_containsEndZAnchor && _mintermClassifier.ByteLookup() is not null) - else if (_findOpts is null && !_containsEndZAnchor) - { - done = - // _mintermClassifier.IsAsciiOnly() - // ? FindEndPositionDeltasDFANoSkipAscii(input, innerLoopLength - 1, - // mode, ref pos, - // currentState.DfaStateId, ref endPos, ref endStateId, ref initialStatePos, - // ref initialStatePosCandidate) - // if there are no edge cases then use the quicker loop - // : - FindEndPositionDeltasDFANoSkip(input, innerLoopLength - 1, mode, ref pos, - currentState.DfaStateId, ref endPos, ref initialStatePos, - ref initialStatePosCandidate); - } - else - { - // dfa loop with potential skipping - done = FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePos, ref initialStatePosCandidate); - } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. if (done || pos >= input.Length) @@ -607,73 +670,90 @@ private int FindEndPosition 0 ? GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } /// - /// Ascii-only variant of the hot loop to conserve memory. - /// Only major difference is the minterm lookup: - /// `int positionId = c >= 128 ? 
0 : mtlookup[c]`; + /// Workhorse inner loop for . Consumes the character by character, + /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, + /// lazily building out the graph as needed. /// - private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + /// + /// The supplies the actual transitioning logic, controlling whether processing is + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's + /// to be non-negative and its to be null; vice versa for + /// . + /// + /// + /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. + /// 0 if iteration completed because we reached an initial state. + /// A negative value if iteration completed because we ran out of input or we failed to transition. + /// + private bool FindEndPositionDeltas(ReadOnlySpan input, int length, RegexRunnerMode mode, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TFindOptimizationsHandler : struct, IInitialStateHandler + where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; int endPos = endPosRef; - // can only be used with full array initialized and <= 255 minterms - byte[] mtlookup = _mintermClassifier.ByteLookup()!; - int currStateId = startStateId; + // int endStateId = endStateIdRef; + int initialStatePos = initialStatePosRef; + int initialStatePosCandidate = initialStatePosCandidateRef; try { // Loop through each character in the input, transitioning from state to state for each. while (true) { - if (currStateId == _deadStateId) + StateFlags flags = TStateHandler.GetStateFlags(this, in state); + + // Check if currentState represents an initial state. If it does, call into any possible find optimizations + // to hopefully more quickly find the next possible starting location. + if (flags.IsInitial()) + { + if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) + { + return true; + } + + initialStatePosCandidate = pos; + } + + // If the state is a dead end, such that we can't transition anywhere else, end the search. + if (state.DfaStateId == _deadStateId) { return true; } - int c = input[pos]; - int positionId = c >= 128 ? 0 : mtlookup[c]; + int positionId = TInputReader.GetPositionId(this, input, pos); - // If the state is nullable for the next character we found a potential end state. - // note: the double array lookup is important here, storing a local variable is expensive - if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, positionId)) + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) { + endPos = pos; + // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); + initialStatePos = initialStatePosCandidate; + + // A match is known to exist. If that's all we need to know, we're done. 
+ if (mode == RegexRunnerMode.ExistenceRequired) { - endPos = pos; - // A match is known to exist. If that's all we need to know, we're done. - if (mode == RegexRunnerMode.ExistenceRequired) - { - return true; - } + return true; } } // If there is more input available try to transition with the next character. - // Note: the order here is important so the transition gets taken - if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, positionId)|| pos >= lengthMinus1) + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) { - if (pos + 1 < input.Length) - { - return false; - } - pos++; - // one off check for the final position - // this is just to move it out of the hot loop - if (!(_stateFlagsArray[currStateId].IsNullable() || - _stateArray[currStateId]!.IsNullableFor( - GetPositionKind(-1)))) - { - return false; - } - // the end position (-1) was nullable - endPos = pos; - return mode == RegexRunnerMode.ExistenceRequired; + return false; } // We successfully transitioned, so update our current input index to match. @@ -685,56 +765,47 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l // Write back the local copies of the ref values. posRef = pos; endPosRef = endPos; - initialStatePosRef = currStateId > 0 ? 
initialStatePosCandidateRef : initialStatePosRef; + // endStateIdRef = endStateId; + initialStatePosRef = initialStatePos; + initialStatePosCandidateRef = initialStatePosCandidate; } } + /// - /// TODO: this is essentially a stripped down version when there's no good prefix optimizations - /// i don't trust the compiler to optimize this and it makes a - /// ~50% difference in performance with removing unnecessary checks alone - /// + /// tbd /// - private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) - where TStateHandler : struct, IStateHandler - where TInputReader : struct, IInputReader - where TFindOptimizationsHandler : struct, IInitialStateHandler - where TNullabilityHandler : struct, INullabilityHandler + where TOptimizedInputReader : struct, IOptimizedInputReader + where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { // initial check for input end to get it out of the loop if (posRef == input.Length) { if (!(_stateFlagsArray[startStateId].IsNullable() || - _stateArray[startStateId]!.IsNullableFor( - GetPositionKind(-1)))) + _stateArray[startStateId]!.IsNullableFor(GetPositionKind(-1)))) { return true; } - - // the end position (-1) was nullable + // the end position kind (-1) was nullable endPosRef = posRef; return true; } - // Debug.Assert(posRef < input.Length, $"input end condition should be handled outside the loop"); // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. 
int pos = posRef; int endPos = endPosRef; - // can only be used with full array initialized and <= 255 minterms byte[] mtlookup = _mintermClassifier.ByteLookup()!; int currStateId = startStateId; - // ldfld only once int deadStateId = _deadStateId; + int initialStateId = _dotstarredInitialStates[CharKind.General].Id; try { - // Loop through each character in the input, transitioning from state to state for each. // The goal is to make this loop as fast as it can possibly be, // every single piece of overhead should be removed here - // there should be not a single callvirt instruction in the loop - // ldfld only if necessary (e.g. a reference changes) - // no memory writes unless necessary while (true) { if (currStateId == deadStateId) @@ -742,11 +813,18 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l return true; } - // acceleratedstatehandler + if (TAcceleratedStateHandler.TryFindNextStartingPosition( + this, mtlookup, input, ref currStateId, ref pos, initialStateId)) + { + // future work could combine this with an immediate state transition + // but this requires changing too much for now + continue; + } // If the state is nullable for the next character, we found a potential end state. // note: the double array lookup is important here, storing a local variable is expensive - if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]])) + // if (_nullabilityArray[currStateId] > 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]])) + if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, input, pos)) { { endPos = pos; @@ -760,7 +838,9 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l // If there is more input available try to transition with the next character. 
// Note: the order here is important so the transition gets taken - if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, mtlookup[input[pos]])|| pos >= lengthMinus1) + if (!DfaStateHandler.TryTakeDFATransition( + this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, input, pos)) + || pos >= lengthMinus1) { if (pos + 1 < input.Length) { @@ -796,7 +876,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan input, int l /// /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups - /// Workhorse inner loop for . Consumes the character by character, + /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// @@ -886,7 +966,7 @@ private bool FindEndPositionDeltasDFA /// TODO: this is the fallback NFA function - /// Workhorse inner loop for . Consumes the character by character, + /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// @@ -921,9 +1001,6 @@ private bool FindEndPositionDeltasNFA input, + int pos); + } + + private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader + { + public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos < input.Length); + return input[pos] >= 128 ? 
0 : lookup[input[pos]]; + } + } + + private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader + { + public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos < input.Length); + Debug.Assert(lookup.Length == (ushort.MaxValue + 1)); + return lookup[input[pos]]; + } + } + + private interface IOptimizedNullabilityHandler + { + public static abstract bool IsNullable(SymbolicRegexMatcher matcher, + byte[] nullabilityArray, int + currStateId, byte[] lookup, ReadOnlySpan input, int pos) + where TOptimizedInputReader : struct, IOptimizedInputReader; + } + + private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + { + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + where TOptimizedInputReader : struct, IOptimizedInputReader + { + return nullabilityArray[currStateId] > 0; + } + } + + private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + { + public static bool IsNullable(SymbolicRegexMatcher matcher, + byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + where TOptimizedInputReader : struct, IOptimizedInputReader + { + return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos)); + } + } + /// /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to /// both take transitions and decide nullability. 
For positions of valid characters that are handled normally, @@ -1662,11 +1791,7 @@ private interface IInputReader public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); } - private readonly struct OptimizedAsciiInputReader : IInputReader - { - public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - matcher._mintermClassifier.GetMintermID(input[pos]); - } + /// This reader omits the special handling of \n for the \Z anchor. private readonly struct NoZAnchorInputReader : IInputReader @@ -1701,35 +1826,77 @@ public static abstract bool TryFindNextStartingPosition(SymbolicRe } /// - /// Interface for optimizations to accelerate search from initial states. + /// Interface for accelerated states, returns true if position was changed /// private interface IAcceleratedStateHandler { - public static abstract void TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader; + public static abstract bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref + int currentStateId, ref int pos, int initialStateId) + where TOptimizedInputReader : struct, IOptimizedInputReader; } + private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + byte[] lookup, + ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + where TOptimizedInputReader : struct, IOptimizedInputReader + + { + // if (!matcher._canBeAcceleratedArray[currentStateId]) + if (currentStateId != initialStateId) + return false; + // Find the first position that matches with some likely character. 
+ if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + return false; + } + + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; + return true; + } + } private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + byte[] lookup, + ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + where TOptimizedInputReader : struct, IOptimizedInputReader + { + // if (!matcher._canBeAcceleratedArray[currentStateId]) + if (currentStateId != initialStateId) + return false; + // int tempPos = pos; // Find the first position that matches with some likely character. if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // No match exists - state = new CurrentState(matcher.GetState(matcher._deadStateId)); + currentStateId = matcher._deadStateId; pos = input.Length; - return; + return true; } + currentStateId = matcher._dotstarredInitialStates[ + matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1] + ].Id; + return false; + } + } - // Update the starting state based on where TryFindNextStartingPosition moved us to. - // As with the initial starting state, if it's a dead end, no match exists. 
- state = new CurrentState( - matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); - return; + private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + byte[] lookup, + ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + where TOptimizedInputReader : struct, IOptimizedInputReader + { + return false; } } From d68bd3c4eb01b414147059b4aed230dde29367f8 Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 18 Jun 2024 21:06:32 +0300 Subject: [PATCH 19/63] refactoring optimizations --- .../Symbolic/MatchReversal.cs | 18 +++-- .../Symbolic/MintermClassifier.cs | 5 +- .../Symbolic/SymbolicRegexMatcher.cs | 66 +++++++++---------- 3 files changed, 42 insertions(+), 47 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs index b7be92195ee58..215aa65a1d14f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -3,15 +3,13 @@ namespace System.Text.RegularExpressions.Symbolic; -internal sealed class MatchReversal where TSet : IComparable, IEquatable +internal sealed class MatchReversal( + MatchReversalKind kind, + int fixedLength, + MatchingState? adjustedStartState = null) + where TSet : IComparable, IEquatable { - public MatchReversal(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null) - { - Kind = kind; - FixedLength = fixedLength; - AdjustedStartState = adjustedStartState; - } - internal MatchReversalKind Kind { get; } - internal int FixedLength { get; } - internal MatchingState? 
AdjustedStartState { get; } + internal MatchReversalKind Kind { get; } = kind; + internal int FixedLength { get; } = fixedLength; + internal MatchingState? AdjustedStartState { get; } = adjustedStartState; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 96cd72c4c0c95..e6352395c670d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -48,13 +48,10 @@ public MintermClassifier(BDD[] minterms) } // ascii-only array to save memory - // int mintermId = c >= 128 ? 0 : mtlookup[c]; - // _isAsciiOnly = true; _isAsciiOnly = true; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - if (mintermRanges[^1].Item2 >= 128) + if (BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2 >= 128) { _isAsciiOnly = false; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 3f0543772b52a..0f0e5b254c113 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -84,6 +84,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Dead end state to quickly return NoMatch, this could potentially be a constant private readonly int _deadStateId; + /// Initial state used to for vectorization + private readonly int 
_initialStateId; + /// Whether the pattern contains any anchor private readonly bool _containsAnyAnchor; @@ -255,8 +258,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder _, SymbolicRegexBuilder(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, @@ -785,13 +790,11 @@ private bool FindEndPositionDeltas 0 && IsNullableWithContext(currStateId, mtlookup[input[pos]])) if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, input, pos)) { + endPos = pos; + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) { - endPos = pos; - // A match is known to exist. If that's all we need to know, we're done. - if (mode == RegexRunnerMode.ExistenceRequired) - { - return true; - } + return true; } } @@ -850,8 +849,8 @@ private bool FindEndPositionDeltas input private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader { + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) { Debug.Assert(pos < input.Length); @@ -1744,6 +1743,7 @@ public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader { + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) { Debug.Assert(pos < input.Length); @@ -1762,6 +1762,7 @@ public static abstract bool IsNullable(SymbolicRegexMatch private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader { @@ -1771,6 +1772,7 @@ public static bool IsNullable(SymbolicRegexMatcher private 
readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader @@ -1848,7 +1850,6 @@ public static bool TryFindNextStartingPosition(SymbolicRe // if (!matcher._canBeAcceleratedArray[currentStateId]) if (currentStateId != initialStateId) return false; - // Find the first position that matches with some likely character. if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { return false; @@ -1872,19 +1873,18 @@ public static bool TryFindNextStartingPosition(SymbolicRe // if (!matcher._canBeAcceleratedArray[currentStateId]) if (currentStateId != initialStateId) return false; - // int tempPos = pos; - // Find the first position that matches with some likely character. - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; - return true; - } - currentStateId = matcher._dotstarredInitialStates[ - matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1] + currentStateId = matcher._dotstarredInitialStates[ + matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1] ].Id; - return false; + return false; + } + + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; + return true; } } From 153dfc30ea0bb6e96a19e3e4243834fe0f2b60df Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 19 Jun 2024 02:29:56 +0300 Subject: [PATCH 20/63] fallback mode and bugfix --- .../src/System.Text.RegularExpressions.csproj | 6 +-- .../Symbolic/MintermClassifier.cs | 46 ++++++++++--------- 
.../Symbolic/SymbolicRegexMatcher.cs | 41 ++++++++++------- .../FunctionalTests/NonBacktrackingTests.cs | 22 --------- ...ystem.Text.RegularExpressions.Tests.csproj | 1 - 5 files changed, 51 insertions(+), 65 deletions(-) delete mode 100644 src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index b3fda5f2f4326..0d952017013c0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,9 +5,9 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - - - + IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649; + + $(NoWarn);CS1574 diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index e6352395c670d..96ab22ce8f967 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -31,7 +31,7 @@ internal sealed class MintermClassifier // /// fallback lookup if over 255 minterms // /// this is almost never used // /// - // private readonly int[]? _intLookup; + private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. 
@@ -57,28 +57,24 @@ public MintermClassifier(BDD[] minterms) } } - // i have never seen a regex use over 80 minterms not to speak of 255, - // but it's there as a fallback mechanism + // It's incredibly rare for a regex to use more than a hundred or two minterms, + // but we need a fallback just in case. if (minterms.Length > 255) { - // WIP: temporary exception to see if any tests in the pipeline reach this - // if nothing reaches this perhaps it'd be easier to just throw an exception - // during construction - throw new Exception($"reached over 255 minterms, count {minterms}"); // over 255 unique sets also means it's never ascii only - // int[] lookup = new int[ushort.MaxValue + 1]; - // for (int mintermId = 1; mintermId < minterms.Length; mintermId++) - // { - // // precompute all assigned minterm categories - // (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - // foreach ((uint start, uint end) in mintermRanges) - // { - // // assign character ranges in bulk - // Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - // slice.Fill(mintermId); - // } - // } - // _intLookup = lookup; + int[] lookup = new int[ushort.MaxValue + 1]; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + { + // precompute all assigned minterm categories + (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); + foreach ((uint start, uint end) in mintermRanges) + { + // assign character ranges in bulk + Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); + slice.Fill(mintermId); + } + } + _intLookup = lookup; } else { @@ -108,8 +104,7 @@ public int GetMintermID(int c) } // high performance variant would use a span directly. // additional memory is saved by using a byte - return _lookup![c]; - // return _intLookup is null ? _lookup![c] : _intLookup[c]; + return _intLookup is null ? 
_lookup![c] : _intLookup[c]; } /// @@ -126,5 +121,12 @@ public int GetMintermID(int c) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public byte[]? ByteLookup() => _lookup; + + /// + /// Int lookup for rare cases + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int[]? IntLookup() => _intLookup; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 0f0e5b254c113..76614272556f6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -415,12 +415,11 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. 
It additionally records the position of the first a after // the c as the low boundary for the starting position.d - // int matchStartLowBoundary = startat; int matchEnd; - if (!_containsEndZAnchor) + // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases + if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) { - bool isAsciiOnly = _mintermClassifier.IsAsciiOnly(); - matchEnd = (isAsciiOnly, _findOpts is not null, _containsAnyAnchor) switch + matchEnd = (_mintermClassifier.IsAsciiOnly(), _findOpts is not null, _containsAnyAnchor) switch { (true, true, true) => FindEndPositionOptimized input, i (false, true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, true) => + (false, false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, false) => + (false, false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else { - // fallback for EndZ anchor + // fallback for Z anchor or over 255 minterms matchEnd = (_findOpts is not null) switch { true => @@ -584,7 +583,6 @@ private int FindEndPositionOptimized(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); - // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. 
if (done || pos >= input.Length) @@ -789,6 +787,7 @@ private bool FindEndPositionDeltas(this, input, ref state, ref pos)) { @@ -1777,6 +1785,7 @@ public static bool IsNullable(SymbolicRegexMatcher byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader { + Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}"); return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos)); } } @@ -1847,12 +1856,11 @@ public static bool TryFindNextStartingPosition(SymbolicRe where TOptimizedInputReader : struct, IOptimizedInputReader { - // if (!matcher._canBeAcceleratedArray[currentStateId]) if (currentStateId != initialStateId) return false; if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - return false; + return true; } // No match exists @@ -1870,7 +1878,6 @@ public static bool TryFindNextStartingPosition(SymbolicRe where TOptimizedInputReader : struct, IOptimizedInputReader { - // if (!matcher._canBeAcceleratedArray[currentStateId]) if (currentStateId != initialStateId) return false; if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) @@ -1878,7 +1885,7 @@ public static bool TryFindNextStartingPosition(SymbolicRe currentStateId = matcher._dotstarredInitialStates[ matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1] ].Id; - return false; + return true; } // No match exists diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs deleted file mode 100644 index 501df78391690..0000000000000 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/NonBacktrackingTests.cs 
+++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Collections; -using System.Collections.Generic; -using Xunit; - -namespace System.Text.RegularExpressions.Tests -{ - /// - /// TODO: Create tests here later - /// - public static partial class NonBacktrackingTests - { - - // [Fact] - // public static void Test() - // { - // } - - } -} diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj index afdd6f1e51f24..dbab47f63d097 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj @@ -19,7 +19,6 @@ - From 4aebe3e77138ec78d34c5be309100dfa0d35b17b Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 19 Jun 2024 02:30:47 +0300 Subject: [PATCH 21/63] reenable warnings --- .../src/System.Text.RegularExpressions.csproj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 0d952017013c0..b3fda5f2f4326 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,9 +5,9 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - IL2026;IL2075;IDE0059;CA1823;CS0162;IDE0060;CS0649; - - $(NoWarn);CS1574 + + + From 1e6f55cb102c312c44001251ad5f2db4fd87bfa7 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 19 Jun 2024 04:00:32 +0300 Subject: [PATCH 22/63] 
anchor edge case --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index eac516c43bcdd..10da6f1af060f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -204,6 +204,12 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node int pos = 0; SymbolicRegexNode? current = node; bool canLoop = true; + // finding anchors inside pattern invalidates this optimization + var bail = new Func, (bool, SymbolicRegexNode)>(concatNode => + { + pos = 0; + return (false, node); + }); var addSingleton = new Func, (bool, SymbolicRegexNode)>(concatNode => { pos += 1; @@ -257,6 +263,12 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node addSingleton(current), {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current), + { + _kind: SymbolicRegexNodeKind.Concat, + _left._info.ContainsSomeAnchor:true, + _right._kind: SymbolicRegexNodeKind.Concat + } => + bail(current), _ => (false, current) }; canLoop = loop; From c6ad3ac3560c865feb9fbda0a48e30e742288a6c Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 19 Jun 2024 23:36:16 +0300 Subject: [PATCH 23/63] anchor edge cases --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 10da6f1af060f..e5b6cf1d1ed9b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -253,6 +253,10 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node #endif (bool loop, SymbolicRegexNode next) = current switch { + // This could potentially be a very good future optimization for + // anchors but there's too many edge cases to guarantee it works. + // one example which fails currently: pattern: @"\By\b", input: "xy" + { _info.ContainsSomeAnchor: true } => bail(current), // if this is reached then entire match is fixed length { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} => @@ -263,12 +267,6 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node addSingleton(current), {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current), - { - _kind: SymbolicRegexNodeKind.Concat, - _left._info.ContainsSomeAnchor:true, - _right._kind: SymbolicRegexNodeKind.Concat - } => - bail(current), _ => (false, current) }; canLoop = loop; From e10b43f3ba60b04da8ffcd94b0856cd62ef4a4e6 Mon Sep 17 00:00:00 2001 From: ieviev <36763595+ieviev@users.noreply.github.com> Date: Thu, 20 Jun 2024 00:26:41 +0300 Subject: [PATCH 24/63] Apply suggestions from code review Co-authored-by: Stephen Toub --- .../Symbolic/MatchingState.cs | 7 +- .../Symbolic/MintermClassifier.cs | 1 + .../Symbolic/RegexNodeConverter.cs | 85 ------------------- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 4 +- .../Symbolic/SymbolicRegexMatcher.cs | 8 +- 5 files changed, 9 insertions(+), 96 
deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 5bd2baf668d3d..1622107e8d9ce 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -20,7 +20,7 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) /// /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor /// - internal readonly int NullabilityInfo; + internal int NullabilityInfo { get; } /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -154,7 +154,8 @@ internal StateFlags BuildStateFlags(bool isInitial) } /// - /// nullability for each context is encoded in a bit + /// Builds the nullability information for the matching statie. + /// Nullability for each context is encoded in a bit /// 0 means node cannot be nullable /// 00001 -> nullable for General /// 00010 -> nullable for BeginningEnd @@ -162,7 +163,6 @@ internal StateFlags BuildStateFlags(bool isInitial) /// 01000 -> nullable for NewLineS /// 10000 -> nullable for WordLetter /// - /// internal byte BuildNullabilityInfo() { byte nullabilityInfo = 0; @@ -173,6 +173,7 @@ internal byte BuildNullabilityInfo() nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 
1 << (int)ck : 0); } } + return nullabilityInfo; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 96ab22ce8f967..3ee908ffd0f06 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -102,6 +102,7 @@ public int GetMintermID(int c) { return 0; } + // high performance variant would use a span directly. // additional memory is saved by using a byte return _intLookup is null ? _lookup![c] : _intLookup[c]; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 31f01271d558b..9194ca00c971c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -527,90 +527,5 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code) } } } - - - // /// - // /// attempt to remove anchors when possible since it reduces overhead - // /// more rewrites could be tried but it's important to preserve PCRE semantics - // /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner - // /// - // /// - // /// - // /// - // internal static SymbolicRegexNode ApplyRootRewrites(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode) - // { - // // only consider removing anchors, otherwise bail - // if (!rootNode._info.ContainsSomeAnchor) return rootNode; - - // // Func _wout = st => - // // { - // // var a_cons = 
System.Reflection.Assembly.Load("System.Console"); - // // var t_cons = a_cons.GetType("System.Console")!; - // // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); - // // wl!.Invoke(null, [st]); - // // return true; - // // }; - - // SymbolicRegexNode ApplyRewrites(SymbolicRegexNode node) - // { - // // Guard against stack overflow due to deep recursion - // if (!StackHelper.TryEnsureSufficientExecutionStack()) - // { - // return StackHelper.CallOnEmptyStack(() => ApplyRewrites(node)); - // } - - // var wl = UnicodeCategoryConditions.WordLetter(builder._charSetSolver); - - // switch (node._kind) - // { - // case SymbolicRegexNodeKind.Concat: - // // _wout($"conc: l:{node._left!._kind} r:{node._right!._kind}"); - // switch (node._left!._kind) - // { - // case SymbolicRegexNodeKind.CaptureStart: - // return builder.CreateConcat(node._left, ApplyRewrites(node._right!)); - // case SymbolicRegexNodeKind.BoundaryAnchor: - // return node._right! switch - // { - // // \b\w{1,}.. -> \w{1,} - // // anchor to the left can be removed - // { - // _kind: SymbolicRegexNodeKind.Concat, _left: - // { - // _kind: SymbolicRegexNodeKind.Loop, _lower: >= 1, _upper: >= int.MaxValue - - // } wordLoop - // } - // when (wordLoop!._left!._kind == SymbolicRegexNodeKind.Singleton) && wordLoop!._left._set.Equals(wl) => ApplyRewrites(node._right!), - // _ => node - // }; - // case SymbolicRegexNodeKind.Loop: - // var loopnode = node._left!; - // // +, {2,}, {3,} anything infinite is a valid rewrite, star is an anchor edge case - // bool isPlusInfinite = loopnode._upper == int.MaxValue && loopnode._lower >= 1; - // bool isWordChar = (loopnode._left!._kind == SymbolicRegexNodeKind.Singleton) && loopnode._left._set.Equals(wl); - // return node._right! 
switch - // { - // // anchor to the right can be removed - // { - // _kind: SymbolicRegexNodeKind.Concat, - // _left.Kind: SymbolicRegexNodeKind.BoundaryAnchor, - // _right._kind: SymbolicRegexNodeKind.CaptureEnd - // } when isPlusInfinite && isWordChar => builder.CreateConcat(loopnode, ApplyRewrites(node._right!._right!)), - // _ => node - // }; - // } - // return node; - - - // default: - // return node; - // } - // } - - // SymbolicRegexNode rewritten = ApplyRewrites(rootNode); - // // _wout(rewritten.ToString()); - // return rewritten; - // } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index e5b6cf1d1ed9b..2a573d9eee285 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -138,9 +138,6 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) /// /// Pre-computed hot-loop version of nullability check /// - /// - /// - /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private bool IsNullableWithContext(int stateId, int mintermId) => ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0; @@ -243,6 +240,7 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node return (false, concatNode); } }); + while (canLoop) { #if DEBUG diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 76614272556f6..1bdc128a34899 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -233,8 +233,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder _optimizedReversalState.Kind != MatchReversalKind.FixedLength, }; }); - // a DFA is sometimes 10x-100x faster than the optimizations - // the "IsUseful" is harming the engine here + + // In some cases where the findOptimizations are useful, just using the DFA can still be faster. _findOpts = findOptimizations switch { { FindMode: FindNextStartingPositionMode.FixedDistanceString_LeftToRight } => findOptimizations, @@ -242,7 +242,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder setIsTooCommon( findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations, - _ => findOptimizations // TODO: unsure which options are left here + _ => findOptimizations }; // _findOpts = findOptimizations; // _findOpts = null; @@ -349,8 +349,6 @@ uint CalculateMintermIdKind(int mintermId) /// internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize); - /// TODO: when you're calling a function millions of times per second even this add 1 does cost something - /// this should be ideally remapped /// Look up what is the character kind given a position ID [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1]; From f581755eeef6d0287c63f94c6e5fda112d32a9da Mon Sep 17 00:00:00 2001 From: ieviev <36763595+ieviev@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:39:33 +0300 Subject: [PATCH 25/63] Apply suggestions from code review Co-authored-by: Stephen Toub --- .../src/System.Text.RegularExpressions.csproj | 3 --- .../Symbolic/MatchingState.cs | 2 +- .../Symbolic/MintermClassifier.cs | 16 +++++++------- 
.../Symbolic/SymbolicRegexMatcher.Automata.cs | 9 -------- .../Symbolic/SymbolicRegexMatcher.cs | 21 ++++++------------- 5 files changed, 14 insertions(+), 37 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index b3fda5f2f4326..5ec4d230d7ba5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -5,9 +5,6 @@ true $(DefineConstants);SYSTEM_TEXT_REGULAREXPRESSIONS false - - - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 1622107e8d9ce..41251ccc82c83 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -154,7 +154,7 @@ internal StateFlags BuildStateFlags(bool isInitial) } /// - /// Builds the nullability information for the matching statie. + /// Builds the nullability information for the matching state. 
/// Nullability for each context is encoded in a bit /// 0 means node cannot be nullable /// 00001 -> nullable for General diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 3ee908ffd0f06..fc8a1abdd55d9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -27,10 +27,9 @@ internal sealed class MintermClassifier /// Conserve memory if pattern is ascii-only private readonly bool _isAsciiOnly; - // /// - // /// fallback lookup if over 255 minterms - // /// this is almost never used - // /// + /// + /// Fallback lookup if over 255 minterms. This is rarely used. + /// private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. @@ -116,17 +115,16 @@ public int GetMintermID(int c) public bool IsAsciiOnly() => _isAsciiOnly; /// - /// Quick mapping from char to minterm, - /// can be null if there is over 255 minterms + /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. + /// Null if there are greater than 255 minterms. /// - /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public byte[]? ByteLookup() => _lookup; /// - /// Int lookup for rare cases + /// Gets a mapping from char to minterm for the rare case when there are >= 255 minterms. + /// Null in the common case where there are fewer than 255 minterms. /// - /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public int[]? 
IntLookup() => _intLookup; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 2a573d9eee285..0925738c9a41e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -59,15 +59,6 @@ internal sealed partial class SymbolicRegexMatcher /// private bool[] _canBeAcceleratedArray; -#if DEBUG - // private readonly Action _wout = st => - // { - // var a_cons = System.Reflection.Assembly.Load("System.Console"); - // var t_cons = a_cons.GetType("System.Console")!; - // var wl = t_cons.GetMethod("WriteLine", [typeof(string)]); - // wl!.Invoke(null, [st]); - // }; -#endif /// /// The transition function for DFA mode. /// Each state has a range of consecutive entries for each minterm ID. 
A range of size 2^L, where L is diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 1bdc128a34899..0e285be987fec 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -218,9 +218,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder((fds) => { - // _wout($"rn{fds.Range is null}"); - // _wout($"cn{fds.Chars is null}"); - // _wout($"cc{fds.Chars!.Length}"); return fds switch { { Chars: not null } => @@ -237,18 +234,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder findOptimizations, - { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } => - findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke)? null : findOptimizations, - { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } => setIsTooCommon( - findOptimizations.FixedDistanceSets![0]) ? null : findOptimizations, + { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke) => null, + { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when setIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null, _ => findOptimizations }; - // _findOpts = findOptimizations; - // _findOpts = null; - // _wout($"{findOptimizations.FindMode}"); - // _wout($"{findOptimizations.FixedDistanceSets![0]}"); - // _wout($"o{_findOpts}"); } // Determine the number of initial states. 
If there's no anchor, only the default previous @@ -485,6 +474,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i case MatchReversalKind.FixedLength: matchStart = (matchEnd - _optimizedReversalState.FixedLength); break; + case MatchReversalKind.MatchStart: case MatchReversalKind.PartialFixedLength: int initialLastStart = -1; // invalid sentinel value @@ -581,6 +571,7 @@ private int FindEndPositionOptimized(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); + // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. if (done || pos >= input.Length) @@ -1856,6 +1847,7 @@ public static bool TryFindNextStartingPosition(SymbolicRe { if (currentStateId != initialStateId) return false; + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { return true; @@ -1878,6 +1870,7 @@ public static bool TryFindNextStartingPosition(SymbolicRe { if (currentStateId != initialStateId) return false; + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { currentStateId = matcher._dotstarredInitialStates[ @@ -1975,8 +1968,6 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche where TStateHandler : struct, IStateHandler { return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); - // cannot be used in NFA mode - // return matcher.IsNullableWithContext(state.DfaStateId, positionId); } } } From 01a9684d65fea316ef062a2c10978939e6fd900f Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 27 Jun 2024 18:20:59 +0300 Subject: [PATCH 26/63] rebased branch and some cleanup --- .../src/System.Text.RegularExpressions.csproj | 4 ++-- .../Text/RegularExpressions/Symbolic/MatchReversal.cs | 2 +- 
.../Text/RegularExpressions/Symbolic/MatchingState.cs | 8 ++------ .../System/Text/RegularExpressions/Symbolic/StateFlags.cs | 5 ----- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 5ec4d230d7ba5..86353b31b5d7b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -72,6 +72,8 @@ + + @@ -93,8 +95,6 @@ - - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs index 215aa65a1d14f..cd00755dbe6dc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -3,7 +3,7 @@ namespace System.Text.RegularExpressions.Symbolic; -internal sealed class MatchReversal( +internal readonly struct MatchReversal( MatchReversalKind kind, int fixedLength, MatchingState? 
adjustedStartState = null) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 41251ccc82c83..55032b39d9bb1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -17,9 +17,6 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) NullabilityInfo = BuildNullabilityInfo(); } - /// - /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor - /// internal int NullabilityInfo { get; } /// The regular expression that labels this state and gives it its semantics. @@ -102,8 +99,7 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m } /// - /// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned - /// Use + /// Cached nullability check with encoded bits /// whereever possible /// [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -170,7 +166,7 @@ internal byte BuildNullabilityInfo() { for (uint ck = 0; ck < CharKind.CharKindCount; ck++) { - nullabilityInfo |= (byte)(IsNullableForInit(ck) ? 1 << (int)ck : 0); + nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 
1 << (int)ck : 0); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index 990eb4807c7f1..c1628ebbcf312 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -28,18 +28,13 @@ internal enum StateFlags : byte /// internal static class StateFlagsExtensions { - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None; - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None; - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None; - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None; - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None; } } From 341ce27f1ce66c17916b22ef40ec94f433d77129 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 27 Jun 2024 18:40:59 +0300 Subject: [PATCH 27/63] cleanup, removing unused features --- .../System/Text/RegularExpressions/Symbolic/StateFlags.cs | 5 ----- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 6 ------ .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 7 ++----- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index c1628ebbcf312..a342aff09b6b8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -20,7 +20,6 @@ internal enum StateFlags : byte IsNullableFlag = 4, CanBeNullableFlag = 8, SimulatesBacktrackingFlag = 16, - IsAcceleratedFlag = 32, } /// @@ -30,11 +29,7 @@ internal static class StateFlagsExtensions { internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None; internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None; - internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None; - internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None; - - internal static bool IsAccelerated(this StateFlags info) => (info & (StateFlags.IsAcceleratedFlag | StateFlags.IsInitialFlag)) != StateFlags.None; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 0925738c9a41e..65bd8834b6508 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -54,10 +54,6 @@ internal sealed partial class SymbolicRegexMatcher /// private byte[] _nullabilityArray; - /// - 
/// Used to short-circuit accelerated states in the hot loop - /// - private bool[] _canBeAcceleratedArray; /// /// The transition function for DFA mode. @@ -301,12 +297,10 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize); ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize); - ArrayResizeAndVolatilePublish(ref _canBeAcceleratedArray, newsize); } _stateArray[state.Id] = state; _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); _nullabilityArray[state.Id] = state.BuildNullabilityInfo(); - _canBeAcceleratedArray[state.Id] = _stateFlagsArray[state.Id].IsAccelerated(); } return state; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 0e285be987fec..3efb0d7db75ef 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -191,7 +191,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; _nullabilityArray = new byte[InitialDfaStateCapacity]; - _canBeAcceleratedArray = new bool[InitialDfaStateCapacity]; _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm @@ -361,7 +360,6 @@ internal TSet GetMintermFromId(int mintermId) return minterms[mintermId]; } - /// TODO: this if-else branch could be called once. 
it's currently causing overhead on every single step [MethodImpl(MethodImplOptions.AggressiveInlining)] private uint GetCharKind(ReadOnlySpan input, int i) where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ? @@ -871,7 +869,6 @@ private bool FindEndPositionDeltas - /// TODO: this is a separate DFA function that takes advantage of short circuit array lookups /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. @@ -901,6 +898,7 @@ private bool FindEndPositionDeltasDFA(this, input, ref state, ref pos)) { @@ -1866,7 +1864,6 @@ public static bool TryFindNextStartingPosition(SymbolicRe byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) where TOptimizedInputReader : struct, IOptimizedInputReader - { if (currentStateId != initialStateId) return false; From 1a28c69f925d72f9fe8837ef58ae47fe41b2e13b Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 27 Jun 2024 19:36:02 +0300 Subject: [PATCH 28/63] cleanup --- .../Symbolic/SymbolicRegexMatcher.cs | 95 +------------------ 1 file changed, 2 insertions(+), 93 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 3efb0d7db75ef..61d3bd99cccec 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -564,11 +564,13 @@ private int FindEndPositionOptimized(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); + } // If the inner loop 
indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -668,99 +670,6 @@ private int FindEndPositionFallback - /// Workhorse inner loop for . Consumes the character by character, - /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, - /// lazily building out the graph as needed. - /// - /// - /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's - /// to be non-negative and its to be null; vice versa for - /// . - /// - /// - /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. - /// 0 if iteration completed because we reached an initial state. - /// A negative value if iteration completed because we ran out of input or we failed to transition. - /// - private bool FindEndPositionDeltas(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) - where TStateHandler : struct, IStateHandler - where TInputReader : struct, IInputReader - where TFindOptimizationsHandler : struct, IInitialStateHandler - where TNullabilityHandler : struct, INullabilityHandler - { - // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. - int pos = posRef; - int endPos = endPosRef; - // int endStateId = endStateIdRef; - int initialStatePos = initialStatePosRef; - int initialStatePosCandidate = initialStatePosCandidateRef; - try - { - // Loop through each character in the input, transitioning from state to state for each. 
- while (true) - { - StateFlags flags = TStateHandler.GetStateFlags(this, in state); - - // Check if currentState represents an initial state. If it does, call into any possible find optimizations - // to hopefully more quickly find the next possible starting location. - if (flags.IsInitial()) - { - if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) - { - return true; - } - - initialStatePosCandidate = pos; - } - - // If the state is a dead end, such that we can't transition anywhere else, end the search. - if (state.DfaStateId == _deadStateId) - { - return true; - } - - int positionId = TInputReader.GetPositionId(this, input, pos); - - // If the state is nullable for the next character, meaning it accepts the empty string, - // we found a potential end state. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) - { - endPos = pos; - // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); - initialStatePos = initialStatePosCandidate; - - // A match is known to exist. If that's all we need to know, we're done. - if (mode == RegexRunnerMode.ExistenceRequired) - { - return true; - } - } - - // If there is more input available try to transition with the next character. - if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) - { - return false; - } - - // We successfully transitioned, so update our current input index to match. - pos++; - } - } - finally - { - // Write back the local copies of the ref values. 
- posRef = pos; - endPosRef = endPos; - // endStateIdRef = endStateId; - initialStatePosRef = initialStatePos; - initialStatePosCandidateRef = initialStatePosCandidate; - } - } - /// /// tbd From 9bba84fc2b8c0bcc8e1efe6ff2ae415833a94563 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 01:04:05 +0300 Subject: [PATCH 29/63] timeout limit changes --- .../Symbolic/SymbolicRegexMatcher.cs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 61d3bd99cccec..0643d2fe9750f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -549,20 +549,22 @@ private int FindEndPositionOptimized CharsPerTimeoutCheck ? pos + CharsPerTimeoutCheck : input.Length; bool done; if (currentState.NfaState is null) + { done = FindEndPositionDeltasDFAOptimized(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); + } else { // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here @@ -629,7 +631,9 @@ private int FindEndPositionFallback CharsPerTimeoutCheck ? 
pos + CharsPerTimeoutCheck : input.Length; From a9577815b8b0a50f46de7bfd01aea8f9110cab5e Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 15:52:40 +0300 Subject: [PATCH 30/63] lookup allocation threshold and timeout limits --- .../Symbolic/MintermClassifier.cs | 55 +++++++++------- .../Symbolic/SymbolicRegexMatcher.cs | 66 +++++++++++-------- .../FunctionalTests/Regex.Match.Tests.cs | 2 +- 3 files changed, 72 insertions(+), 51 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index fc8a1abdd55d9..88f3de35b23f9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -20,18 +20,20 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - private static readonly byte[] s_emptyLookup = new byte[ushort.MaxValue + 1]; /// An array used to map characters to minterms private readonly byte[]? _lookup; - /// Conserve memory if pattern is ascii-only - private readonly bool _isAsciiOnly; - /// /// Fallback lookup if over 255 minterms. This is rarely used. /// private readonly int[]? _intLookup; + + /// + /// Maximum ordinal character for a non-0 minterm, used to conserve memory + /// + private readonly int _maxChar; + /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. public MintermClassifier(BDD[] minterms) @@ -42,18 +44,24 @@ public MintermClassifier(BDD[] minterms) if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). 
- _lookup = s_emptyLookup; + _lookup = Array.Empty(); return; } - // ascii-only array to save memory - _isAsciiOnly = true; + // attempt to save memory in common cases by allocating only up to the highest char code for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - if (BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2 >= 128) - { - _isAsciiOnly = false; - } + _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); + } + // increment by 1 to fit the highest character code in the 0-based array as well + _maxChar += 1; + + // the trade-off is somewhere around 5% performance for a higher initial allocation. + // past a certain threshold where the maxChar is already large, + // the full 64k can be allocated and OptimizedFullInputReader can be used + if (_maxChar > 32_000) + { + _maxChar = ushort.MaxValue + 1; } // It's incredibly rare for a regex to use more than a hundred or two minterms, @@ -61,7 +69,7 @@ public MintermClassifier(BDD[] minterms) if (minterms.Length > 255) { // over 255 unique sets also means it's never ascii only - int[] lookup = new int[ushort.MaxValue + 1]; + int[] lookup = new int[_maxChar]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // precompute all assigned minterm categories @@ -77,7 +85,7 @@ public MintermClassifier(BDD[] minterms) } else { - byte[] lookup = new byte[_isAsciiOnly ? 128 : ushort.MaxValue + 1]; + byte[] lookup = new byte[_maxChar]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // precompute all assigned minterm categories @@ -97,23 +105,14 @@ public MintermClassifier(BDD[] minterms) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - if (_isAsciiOnly && (c >= 128)) + if (c > _maxChar) { return 0; } - // high performance variant would use a span directly. 
- // additional memory is saved by using a byte + // high performance inner-loop variant uses the array directly return _intLookup is null ? _lookup![c] : _intLookup[c]; } - - /// - /// Whether to use the low memory ascii-only hot loop or the full loop - /// - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool IsAsciiOnly() => _isAsciiOnly; - /// /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. /// Null if there are greater than 255 minterms. @@ -127,5 +126,13 @@ public int GetMintermID(int c) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public int[]? IntLookup() => _intLookup; + + /// + /// Whether the full 64K char lookup is allocated. + /// This accelerates the minterm mapping by removing an if-else case, + /// and is only considered for the common <= 255 minterms case + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 0643d2fe9750f..18e2c256e0854 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -404,31 +404,31 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) { - matchEnd = (_mintermClassifier.IsAsciiOnly(), _findOpts is not null, _containsAnyAnchor) switch + matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) 
switch { - (true, true, true) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, true, false) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false, false) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false, true) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true, false) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true, true) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, false) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, true) => - FindEndPositionOptimized + FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } @@ -704,6 +704,7 @@ private int FindEndPositionFallback= lengthMinus1) { if (pos + 1 < input.Length) @@ -1633,26 +1634,39 @@ public static void UndoTransition(ref CurrentState state) #endif } + /// + /// This input reader attempts to minimize overhead + /// by handling constraints outside of the loop: + /// 1. the position must be already valid for the input. + /// 2. the pattern must not to contain \Z. + /// 3. 
to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm + /// private interface IOptimizedInputReader { - public static abstract int GetPositionId(byte[] lookup, ReadOnlySpan input, + public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos); } - private readonly struct OptimizedAsciiInputReader : IOptimizedInputReader + /// + /// This reader maps all characters > maxChar to 0 + /// + private readonly struct OptimizedSmallInputReader : IOptimizedInputReader { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) + public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) { Debug.Assert(pos < input.Length); - return input[pos] >= 128 ? 0 : lookup[input[pos]]; + return input[pos] > maxChar ? 0 : lookup[input[pos]]; } } - private readonly struct OptimizedUnicodeInputReader : IOptimizedInputReader + /// + /// This reader is effectively an array lookup for the full 64k utf16 code unit mapping + /// + private readonly struct OptimizedFullInputReader : IOptimizedInputReader { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetPositionId(byte[] lookup, ReadOnlySpan input, int pos) + public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) { Debug.Assert(pos < input.Length); Debug.Assert(lookup.Length == (ushort.MaxValue + 1)); @@ -1686,7 +1700,7 @@ public static bool IsNullable(SymbolicRegexMatcher where TOptimizedInputReader : struct, IOptimizedInputReader { Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}"); - return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, input, pos)); + return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, 
TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos)); } } @@ -1784,7 +1798,7 @@ public static bool TryFindNextStartingPosition(SymbolicRe if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { currentStateId = matcher._dotstarredInitialStates[ - matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, input, pos - 1) + 1] + matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1] ].Id; return true; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 57780531253d3..94ef063f1c079 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck() { // This constant must be at least as large as the one in the implementation that sets the maximum number // of innermost loop iterations between timeout checks. - const int CharsToTriggerTimeoutCheck = 10000; + const int CharsToTriggerTimeoutCheck = 25000; // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger. 
Assert.Throws(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1)) .Match(new string('a', CharsToTriggerTimeoutCheck))); From 7e86855a92b880a5b30b6be35c1b7007c130ba53 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 18:16:49 +0300 Subject: [PATCH 31/63] char mapping --- .../Symbolic/MintermClassifier.cs | 16 ++++++++++------ .../Symbolic/SymbolicRegexMatcher.cs | 5 +++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 88f3de35b23f9..029ee00f9d8a0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -31,6 +31,7 @@ internal sealed class MintermClassifier /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory + /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1 /// private readonly int _maxChar; @@ -53,15 +54,12 @@ public MintermClassifier(BDD[] minterms) { _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); } - // increment by 1 to fit the highest character code in the 0-based array as well - _maxChar += 1; - // the trade-off is somewhere around 5% performance for a higher initial allocation. 
// past a certain threshold where the maxChar is already large, // the full 64k can be allocated and OptimizedFullInputReader can be used if (_maxChar > 32_000) { - _maxChar = ushort.MaxValue + 1; + _maxChar = ushort.MaxValue; } // It's incredibly rare for a regex to use more than a hundred or two minterms, @@ -69,7 +67,7 @@ public MintermClassifier(BDD[] minterms) if (minterms.Length > 255) { // over 255 unique sets also means it's never ascii only - int[] lookup = new int[_maxChar]; + int[] lookup = new int[_maxChar + 1]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // precompute all assigned minterm categories @@ -85,7 +83,7 @@ public MintermClassifier(BDD[] minterms) } else { - byte[] lookup = new byte[_maxChar]; + byte[] lookup = new byte[_maxChar + 1]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // precompute all assigned minterm categories @@ -134,5 +132,11 @@ public int GetMintermID(int c) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1; + + /// + /// Maximum ordinal character for a non-0 minterm, used to conserve memory + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int MaxChar() => _maxChar; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 18e2c256e0854..34ddab8f63533 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -704,7 +704,7 @@ private int FindEndPositionFallback input, int pos) { - Debug.Assert(pos < input.Length); + Debug.Assert(pos < input.Length, "pos < input.Length"); + 
Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}"); return input[pos] > maxChar ? 0 : lookup[input[pos]]; } } From 99b5717e61f0f7ee2e22597ae5ac69defe689dd4 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 20:09:36 +0300 Subject: [PATCH 32/63] empty array mapping --- .../System/Text/RegularExpressions/Symbolic/MintermClassifier.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 029ee00f9d8a0..12aaf3ce02c60 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -46,6 +46,7 @@ public MintermClassifier(BDD[] minterms) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). 
_lookup = Array.Empty(); + _maxChar = -1; return; } From 47c6b0477cfb55cfd9ddf1b2a4ae81bd742fdba9 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 23:27:51 +0300 Subject: [PATCH 33/63] adding timeout check to create-derivative --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 17 ++-- .../Symbolic/SymbolicRegexMatcher.cs | 94 ++++++++++++------- .../Symbolic/SymbolicRegexThresholds.cs | 15 ++- 3 files changed, 77 insertions(+), 49 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 65bd8834b6508..fd16805c7d455 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -230,12 +230,6 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node while (canLoop) { -#if DEBUG - // if (current._left is null) - // _wout($"NULL {current._kind}"); - // else - // _wout($"{pos} {current._kind} l:{current._left!._kind} {current}"); -#endif (bool loop, SymbolicRegexNode next) = current switch { // This could potentially be a very good future optimization for @@ -386,16 +380,23 @@ private int GetCoreStateId(int nfaStateId) /// Gets or creates a new DFA transition. /// This function locks the matcher for safe concurrent use of the private bool TryCreateNewTransition( - MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState) + MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? 
nextState, + long timeoutOccursAt = 0) { Debug.Assert(offset < _dfaDelta.Length); - lock (this) { // check if meanwhile delta[offset] has become defined possibly by another thread MatchingState? targetState = _stateArray[_dfaDelta[offset]]; if (targetState is null) { + // check if there is an active timer + if (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) + { + nextState = null; + return false; + } + if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) { nextState = null; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 34ddab8f63533..500221782b363 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -538,7 +538,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } } - private int FindEndPositionOptimized(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) + private int FindEndPositionOptimized( + ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler @@ -549,28 +550,32 @@ private int FindEndPositionOptimized CharsPerTimeoutCheck ? - pos + CharsPerTimeoutCheck : - input.Length; - + int innerLoopLength; bool done; if (currentState.NfaState is null) { + const int dfaCharsPerTimeoutCheck = 100000; + innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck + ? 
pos + dfaCharsPerTimeoutCheck + : input.Length; done = FindEndPositionDeltasDFAOptimized(input, innerLoopLength - 1, mode, ref pos, currentState.DfaStateId, ref endPos, ref initialStatePosCandidate, - ref initialStatePosCandidate); + ref initialStatePosCandidate, timeoutOccursAt); } else { // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here + // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms + const int nfaCharsPerTimeoutCheck = 1000; + innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck + ? pos + nfaCharsPerTimeoutCheck + : input.Length; done = FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, + FullNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } @@ -631,16 +636,34 @@ private int FindEndPositionFallback CharsPerTimeoutCheck ? - pos + CharsPerTimeoutCheck : - input.Length; - - bool done = currentState.NfaState is not null ? - FindEndPositionDeltasNFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate) : - FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); + // The fallback function has lower limits due to possibly worse performance + int innerLoopLength; + bool done; + if (currentState.NfaState is null) + { + const int dfaCharsPerTimeoutCheck = 25000; + innerLoopLength = _checkTimeout && input.Length - pos > dfaCharsPerTimeoutCheck + ? 
pos + dfaCharsPerTimeoutCheck + : input.Length; + done = + FindEndPositionDeltasDFA(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, + ref endStateId, ref initialStatePosCandidate, timeoutOccursAt); + } + else + { + // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here + // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms + const int nfaCharsPerTimeoutCheck = 1000; + innerLoopLength = _checkTimeout && input.Length - pos > nfaCharsPerTimeoutCheck + ? pos + nfaCharsPerTimeoutCheck + : input.Length; + done = + FindEndPositionDeltasNFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, + ref endStateId, ref initialStatePosCandidate); + } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -680,7 +703,7 @@ private int FindEndPositionFallback private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, long timeoutOccursAt) where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler @@ -748,7 +771,8 @@ private int FindEndPositionFallback= lengthMinus1) { if (pos + 1 < input.Length) @@ -800,7 +824,8 @@ private int FindEndPositionFallback private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, ref CurrentState state, 
ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, + long timeoutOccursAt) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -851,7 +876,7 @@ private bool FindEndPositionDeltasDFA= length || !TStateHandler.TryTakeTransition(this, ref state, - positionId)) + positionId, timeoutOccursAt)) { return false; } @@ -872,7 +897,6 @@ private bool FindEndPositionDeltasDFA - /// TODO: this is the fallback NFA function /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. @@ -889,7 +913,8 @@ private bool FindEndPositionDeltasDFA - private bool FindEndPositionDeltasNFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + private bool FindEndPositionDeltasNFA( + ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader @@ -931,7 +956,7 @@ private bool FindEndPositionDeltasNFA= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) { return false; } @@ -1007,7 +1032,8 @@ private int FindStartPosition(CurrentState st /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. 
/// - private bool FindStartPositionDeltasDFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + private bool FindStartPositionDeltasDFA( + ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler @@ -1037,7 +1063,7 @@ private bool FindStartPositionDeltasDFA matcher, in CurrentState state, uint nextCharKind); public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos); public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); - public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId); + public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, long timeoutOccursAt); public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state); } @@ -1380,7 +1406,8 @@ public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentS /// Take the transition to the next DFA state. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, + long timeoutOccursAt) { Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); @@ -1412,7 +1439,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur /// Take the transition to the next DFA state without paying for the NFA structure [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, - int mintermId) + int mintermId, long timeoutOccursAt) { Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}."); // Use the mintermId for the character being read to look up which state to transition to. @@ -1429,7 +1456,7 @@ public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, matcher.DeltaOffset(state, mintermId), - checkThreshold: true, out MatchingState? nextState)) + checkThreshold: true, out MatchingState? nextState, timeoutOccursAt)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. @@ -1516,7 +1543,8 @@ public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentStat } /// Take the transition to the next NFA state. 
- public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, + long timeoutOccursAt = 0) { Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index d455f26da1dcf..f26009f035a57 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -22,25 +22,24 @@ internal static class SymbolicRegexThresholds /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex. /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing /// to create a new node and the graph is already or newly beyond this threshold. - /// TODO: summarize this - /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s - /// with an entirely different search-time algorithmic complexity - /// 100_000 isn't a really a high memory cost either, - /// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has + /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 
50 MB; + /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially + /// significant search-time performance gains /// - internal const int NfaThreshold = 100_000; + internal const int NfaThreshold = 25_000; /// /// Default maximum estimated safe expansion size of a AST /// after the AST has been anlayzed for safe handling. - /// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too /// /// If the AST exceeds this threshold then is thrown. /// This default value may be overridden with the AppContext data /// whose name is given by . /// + /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s, + /// although it could be safely raised higher at the expense of worst-case NFA performance /// - internal const int DefaultSymbolicRegexSafeSizeThreshold = 1000; + internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; // nfa speed constraint ///The environment variable name for a value overriding the default value internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE"; From 22d23fad468dd00779fbccd90838787cfa5d7d71 Mon Sep 17 00:00:00 2001 From: ieviev Date: Sun, 30 Jun 2024 23:57:17 +0300 Subject: [PATCH 34/63] some cleanup --- .../RegularExpressions/Symbolic/MintermClassifier.cs | 6 +++--- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 12 ++++-------- .../Symbolic/SymbolicRegexThresholds.cs | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 12aaf3ce02c60..323cc6cdcc316 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs 
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -55,9 +55,9 @@ public MintermClassifier(BDD[] minterms) { _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); } - // the trade-off is somewhere around 5% performance for a higher initial allocation. - // past a certain threshold where the maxChar is already large, - // the full 64k can be allocated and OptimizedFullInputReader can be used + // there is an opportunity to gain around 5% performance for allocating the + // full 64K, past a certain threshold where maxChar is already large. + // TODO: what should this threshold be? if (_maxChar > 32_000) { _maxChar = ushort.MaxValue; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index fd16805c7d455..e5430249a86a8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -390,14 +390,10 @@ private bool TryCreateNewTransition( MatchingState? 
targetState = _stateArray[_dfaDelta[offset]]; if (targetState is null) { - // check if there is an active timer - if (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) - { - nextState = null; - return false; - } - - if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) + if (// check if there is an active timer + (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || + // check if size exceeds the NFA threshold + (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)) { nextState = null; return false; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index f26009f035a57..5bcda9cfba731 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -39,7 +39,7 @@ internal static class SymbolicRegexThresholds /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s, /// although it could be safely raised higher at the expense of worst-case NFA performance /// - internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; // nfa speed constraint + internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; ///The environment variable name for a value overriding the default value internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE"; From 761f897bff33c3f0b94ecd800d6d9fa6b7e41b28 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 1 Jul 2024 01:48:29 +0300 Subject: [PATCH 35/63] comments and cleanup --- .../Symbolic/MatchingState.cs | 11 -- .../Symbolic/SymbolicRegexInfo.cs | 1 + 
.../Symbolic/SymbolicRegexMatcher.cs | 108 ++++++++++++------ 3 files changed, 73 insertions(+), 47 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 55032b39d9bb1..405be0318bbd5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -100,7 +100,6 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m /// /// Cached nullability check with encoded bits - /// whereever possible /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) @@ -108,16 +107,6 @@ internal bool IsNullableFor(uint nextCharKind) return ((1 << (int)nextCharKind) & NullabilityInfo) != 0; } - /// - /// Full nullability check for initialization - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal bool IsNullableForInit(uint nextCharKind) - { - Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); - return Node.IsNullableFor(CharKind.Context(PrevCharKind, nextCharKind)); - } - /// /// Builds a with the relevant flags set. 
/// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index 750fbed4774bf..b0aa0cd6e938d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -55,6 +55,7 @@ private static SymbolicRegexInfo Create( public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0; public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0; + public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0; public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 500221782b363..037796198ed70 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -211,31 +211,30 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder((fds) => - { - return fds switch - { - { Chars: not null } => - // anything above 4 uint16 chars is generally slower than DFA - fds.Negated || - (fds.Chars.Length > 4 && - Array.Exists(fds.Chars, char.IsAsciiLetterLower)), - { Range: not null } => false, - // for fixed length strings just trust the optimizations - _ => _optimizedReversalState.Kind != MatchReversalKind.FixedLength, - }; - }); - // In some cases where the findOptimizations are useful, just using the DFA can still be faster. 
_findOpts = findOptimizations switch { - { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(setIsTooCommon.Invoke) => null, - { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when setIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null, - _ => findOptimizations + // for sets in fixed length patterns just trust the optimizations, + // the performance can be either better or worse depending on frequency + { + FindMode: + FindNextStartingPositionMode.FixedDistanceSets_LeftToRight or + FindNextStartingPositionMode.LeadingSet_LeftToRight} when + _optimizedReversalState.Kind != MatchReversalKind.FixedLength => findOptimizations, + // string literals are the best case + { + FindMode: + FindNextStartingPositionMode.LeadingString_LeftToRight or + FindNextStartingPositionMode.FixedDistanceString_LeftToRight or + FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight + } => findOptimizations, + // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null + { FindMode: FindNextStartingPositionMode.LeadingStrings_LeftToRight } => findOptimizations, + { FindMode: FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations, + // for singular character sets it depends if there's any reasonably small set to be accelerated + { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => null, + { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null, + _ => null }; } @@ -291,6 +290,36 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder implementations to avoid: + // - ProbabilisticCharSearchValues + // - ProbabilisticWithAsciiCharSearchValues`1 + // - AsciiCharSearchValues`1 + // - Any5SearchValues`2" + // 
SearchValues implementations to avoid: + // - StringSearchValuesAhoCorasick`2 + bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet) + { + return fixedDistanceSet switch + { + // anything above 4 uint16 chars is generally slower than DFA + { Chars: not null } => + // negated sets are usually large + fixedDistanceSet.Negated || + (fixedDistanceSet.Chars.Length > 4 + // TODO: this extra condition is currently kept so there's no regressions + // if ~500mb/s worst case is acceptable then this could be removed + // but being able to guess which character sets are not too frequent can + // often reach over 1gb/s with AVX + && Array.Exists(fixedDistanceSet.Chars, char.IsAsciiLetterLower)), + { Range: not null } => false, + _ => false, + }; + } + // Maps a minterm ID to a character kind uint CalculateMintermIdKind(int mintermId) { @@ -561,9 +590,9 @@ private int FindEndPositionOptimized(input, innerLoopLength - 1, mode, ref pos, + TOptimizedNullabilityHandler>(input, innerLoopLength - 1, mode, timeoutOccursAt, ref pos, currentState.DfaStateId, ref endPos, ref initialStatePosCandidate, - ref initialStatePosCandidate, timeoutOccursAt); + ref initialStatePosCandidate); } else { @@ -648,8 +677,8 @@ private int FindEndPositionFallback(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePosCandidate, timeoutOccursAt); + TNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, + ref endStateId, ref initialStatePosCandidate); } else { @@ -699,11 +728,14 @@ private int FindEndPositionFallback - /// tbd + /// This version of uses a different set of interfaces, + /// which don't check for many inner loop edge cases e.g. input end or '\n'. + /// All edge cases are handled before entering the loop. 
/// private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, long timeoutOccursAt) + long timeoutOccursAt, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, + ref int initialStatePosCandidateRef) where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler @@ -742,8 +774,8 @@ private int FindEndPositionFallback( this, mtlookup, input, ref currStateId, ref pos, initialStateId)) { - // future work could combine this with an immediate state transition - // but this requires changing too much for now + // a good potential future optimization here would + // be to combine this with an immediate state transition if (pos == input.Length) { // patterns such as ^$ can be nullable right away @@ -769,7 +801,7 @@ private int FindEndPositionFallback private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef, - long timeoutOccursAt) + long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef + ) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -1666,7 +1698,7 @@ public static void UndoTransition(ref CurrentState state) /// This input reader attempts to minimize overhead /// by handling constraints outside of the loop: /// 1. the position must be already valid for the input. - /// 2. the pattern must not to contain \Z. + /// 2. the pattern must not contain \Z. /// 3. 
to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm /// private interface IOptimizedInputReader @@ -1690,7 +1722,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i } /// - /// This reader is effectively an array lookup for the full 64k utf16 code unit mapping + /// This reader is effectively an array lookup for the all utf16 code units /// private readonly struct OptimizedFullInputReader : IOptimizedInputReader { @@ -1703,6 +1735,10 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i } } + /// + /// This nullability handler interface can be used in DFAs + /// for patterns that do not contain \Z + /// private interface IOptimizedNullabilityHandler { public static abstract bool IsNullable(SymbolicRegexMatcher matcher, @@ -1728,7 +1764,7 @@ public static bool IsNullable(SymbolicRegexMatcher byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader { - Debug.Assert(pos < input.Length, $"input end should not be handled here {input}, pat:{matcher._dotstarredInitialStates[CharKind.General].Node}"); + Debug.Assert(pos < input.Length, $"input end should not be handled here"); return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos)); } } From 53924eb9ac148b8ad236d855c4eafb0dd4eb4fee Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 1 Jul 2024 02:38:26 +0300 Subject: [PATCH 36/63] cleanup and comments --- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 9 ++------- .../Symbolic/SymbolicRegexRunnerFactory.cs | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 037796198ed70..fef5b3ece17d3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -665,8 +665,7 @@ private int FindEndPositionFallback 0 ? GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } @@ -1468,7 +1463,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur return false; } - /// Take the transition to the next DFA state without paying for the NFA structure + /// Transition function that only considers DFA state id [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, int mintermId, long timeoutOccursAt) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index c046531f8a295..aa6708a60d01a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -21,7 +21,6 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping); SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root); - // rootNode = RegexNodeConverter.ApplyRootRewrites(bddBuilder, rootNode); // Determine if the root node is supported for safe handling int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold(); 
From e66d3d37575371fab6777366e4d8d2bb6c9b0f31 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 1 Jul 2024 14:51:58 +0300 Subject: [PATCH 37/63] reflecting new limits in tests --- .../Symbolic/SymbolicRegexMatcher.cs | 21 ++++++++++++------- .../Symbolic/SymbolicRegexThresholds.cs | 2 +- .../FunctionalTests/Regex.Match.Tests.cs | 2 +- .../tests/UnitTests/SymbolicRegexTests.cs | 6 +++--- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index fef5b3ece17d3..c95c1802c2a44 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -84,7 +84,7 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Dead end state to quickly return NoMatch, this could potentially be a constant private readonly int _deadStateId; - /// Initial state used to for vectorization + /// Initial state used for vectorization private readonly int _initialStateId; /// Whether the pattern contains any anchor @@ -785,7 +785,8 @@ private int FindEndPositionFallback(this, _nullabilityArray, currStateId, mtlookup, input, pos)) + if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, + maxChar, input, pos)) { endPos = pos; // A match is known to exist. If that's all we need to know, we're done. 
@@ -1738,16 +1739,20 @@ private interface IOptimizedNullabilityHandler { public static abstract bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int - currStateId, byte[] lookup, ReadOnlySpan input, int pos) + currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader; } private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, + int maxChar, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader { + Debug.Assert(pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, + "nullabilityArray grown but the reference is not up to date"); return nullabilityArray[currStateId] > 0; } } @@ -1756,11 +1761,13 @@ public static bool IsNullable(SymbolicRegexMatcher { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader { - Debug.Assert(pos < input.Length, $"input end should not be handled here"); - return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos)); + Debug.Assert(pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + 
return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, + maxChar, input, pos)); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index 5bcda9cfba731..b8c559135e5e4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -30,7 +30,7 @@ internal static class SymbolicRegexThresholds /// /// Default maximum estimated safe expansion size of a AST - /// after the AST has been anlayzed for safe handling. + /// after the AST has been analyzed for safe handling. /// /// If the AST exceeds this threshold then is thrown. /// This default value may be overridden with the AppContext data diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 94ef063f1c079..bb3f7495f03fe 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck() { // This constant must be at least as large as the one in the implementation that sets the maximum number // of innermost loop iterations between timeout checks. - const int CharsToTriggerTimeoutCheck = 25000; + const int CharsToTriggerTimeoutCheck = 100000; // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger. 
Assert.Throws(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1)) .Match(new string('a', CharsToTriggerTimeoutCheck))); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index cbddba878edc2..7192b70cec451 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -209,13 +209,13 @@ public static IEnumerable UnsafeThresholdTests_MemberData() [ // simple counters that are too large "((ab){0,9000})", - "((ab){1000})", + "((ab){5000})", "((ab){100,5000})", // almost infinite lower bound "a{2147483646,}", // 2147483646 = int.MaxValue-1 // nested small counters causing unsafe blowup through multiplicative nature of counter nesting - "(((ab){10}){10}){10}", // more than 10^3 - "((((abcd){4}){4}){4}){4}", // exponential: more than 4^5 = 1024 + "(((ab){10}){10}){50}", // more than 10^3 * 5 + "(((((abcd){4}){4}){4}){4}){10}", // exponential: more than 4^5 * 10 = 10240 // combined large counters "((ab){1000}){1000}", // more than 1000^2 "((ab){99999999}){99999999}", // multiply: much more than int.MaxValue From 65c0b8bce3fa583ebf9b47cd3561aab4fe4fc0a2 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 1 Jul 2024 21:53:25 +0300 Subject: [PATCH 38/63] rerunning tests --- .../Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index b8c559135e5e4..9509da2a751d8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs 
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -24,7 +24,7 @@ internal static class SymbolicRegexThresholds /// to create a new node and the graph is already or newly beyond this threshold. /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB; /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially - /// significant search-time performance gains + /// significant search-time performance gains. /// internal const int NfaThreshold = 25_000; From de085b46a4ef486add83555696e64f3a43fde858 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 1 Jul 2024 23:05:26 +0300 Subject: [PATCH 39/63] retesting DFA timeout --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index bb3f7495f03fe..7f946adfd2d27 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck() { // This constant must be at least as large as the one in the implementation that sets the maximum number // of innermost loop iterations between timeout checks. - const int CharsToTriggerTimeoutCheck = 100000; + const int CharsToTriggerTimeoutCheck = 200000; // Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger. 
Assert.Throws(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1)) .Match(new string('a', CharsToTriggerTimeoutCheck))); From 5ef3b320ed5cf76b53cf48ae8c231370a95a98bf Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 2 Jul 2024 04:01:38 +0300 Subject: [PATCH 40/63] more precise regex memory limit for DFA mode --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 5 ++-- .../Symbolic/SymbolicRegexMatcher.cs | 25 ++++++++++--------- .../Symbolic/SymbolicRegexThresholds.cs | 17 ++++++------- .../tests/FunctionalTests/Regex.Ctor.Tests.cs | 2 +- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index e5430249a86a8..58807073d944d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -392,8 +392,9 @@ private bool TryCreateNewTransition( { if (// check if there is an active timer (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || - // check if size exceeds the NFA threshold - (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)) + // check if amount of nodes exceeds the NFA threshold + (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold) + ) { nextState = null; return false; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index c95c1802c2a44..de2631aca0149 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.IO; using System.Numerics; +using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -220,7 +221,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder findOptimizations, + _optimizedReversalState.Kind == MatchReversalKind.FixedLength => findOptimizations, // string literals are the best case { FindMode: @@ -573,9 +574,11 @@ private int FindEndPositionOptimized(input, pos - 1)]); int endPos = NoMatchExists; + int lengthMinus1 = input.Length - 1; while (true) { @@ -584,15 +587,14 @@ private int FindEndPositionOptimized dfaCharsPerTimeoutCheck + innerLoopLength = _checkTimeout && lengthMinus1 - pos > dfaCharsPerTimeoutCheck ? pos + dfaCharsPerTimeoutCheck - : input.Length; + : lengthMinus1; done = FindEndPositionDeltasDFAOptimized(input, innerLoopLength - 1, mode, timeoutOccursAt, ref pos, - currentState.DfaStateId, ref endPos, ref initialStatePosCandidate, - ref initialStatePosCandidate); + TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState.DfaStateId, ref endPos); } else { @@ -729,8 +731,7 @@ private int FindEndPositionFallback private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, - long timeoutOccursAt, ref int posRef, int startStateId, ref int endPosRef, ref int initialStatePosRef, - ref int initialStatePosCandidateRef) + long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler @@ 
-739,7 +740,7 @@ private int FindEndPositionFallback= lengthMinus1) { - if (pos + 1 < input.Length) + if (pos < lengthMinus1) { return false; } @@ -829,7 +830,7 @@ private int FindEndPositionFallback 0 ? initialStatePosCandidateRef : initialStatePosRef; + currentStateIdRef = currStateId; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index 9509da2a751d8..5dcaa31225941 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -8,25 +8,22 @@ namespace System.Text.RegularExpressions.Symbolic /// internal static class SymbolicRegexThresholds { - /// Maximum number of built states before switching over to NFA mode. + /// Maximum number of instances before switching over to NFA mode. /// /// By default, all matching starts out using DFAs, where every state transitions to one and only one /// state for any minterm (each character maps to one minterm). Some regular expressions, however, can result /// in really, really large DFA state graphs, much too big to actually store. Instead of failing when we /// encounter such state graphs, at some point we instead switch from processing as a DFA to processing as - /// an NFA. As an NFA, we instead track all of the states we're in at any given point, and transitioning - /// from one "state" to the next really means for every constituent state that composes our current "state", - /// we find all possible states that transitioning out of each of them could result in, and the union of - /// all of those is our new "state". This constant represents the size of the graph after which we start - /// processing as an NFA instead of as a DFA. 
This processing doesn't change immediately, however. All - /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex. - /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing - /// to create a new node and the graph is already or newly beyond this threshold. + /// an NFA. As an NFA, we instead track all of the states we're in at any given point. + /// + /// /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB; /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially /// significant search-time performance gains. + /// Worst case memory consumption for the regex instance can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))) + /// while in most cases the MatchingState part can be ignored, as only a subset of nodes have their own state. 
/// - internal const int NfaThreshold = 25_000; + internal const int NfaNodeCountThreshold = 125_000; /// /// Default maximum estimated safe expansion size of a AST diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs index cefad99252342..b9659996a4e51 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs @@ -133,7 +133,7 @@ public static void Ctor_Invalid() Assert.Throws(() => new Regex(@"(?>a*)a", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and atomics Assert.Throws(() => new Regex(@"\Ga", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and start anchors Assert.Throws(() => new Regex(@"(?A)(?<-C>B)$", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and balancing groups - Assert.Throws(() => new Regex(@"\w{1,1001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion + Assert.Throws(() => new Regex(@"\w{1,100001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion } } From 281446fd3775eb5dda439769c770fcb617c0e3ff Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 2 Jul 2024 16:59:04 +0300 Subject: [PATCH 41/63] reverting change --- .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index de2631aca0149..2f06c8e4af85c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ 
b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -804,7 +804,7 @@ private int FindEndPositionFallback= lengthMinus1) { - if (pos < lengthMinus1) + if (pos + 1 < input.Length) { return false; } From 8f78046589334e7c97851998f4ceecd58b789a46 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 3 Jul 2024 03:52:56 +0300 Subject: [PATCH 42/63] reverting reversal refactor --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 9 +- .../Symbolic/SymbolicRegexMatcher.cs | 86 ++++++++++--------- .../Symbolic/SymbolicRegexThresholds.cs | 7 +- 3 files changed, 57 insertions(+), 45 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 58807073d944d..3f096b97db3b5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -182,7 +182,7 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint /// 2) the reversal starts at abc.*| /// /// reversed initial pattern - /// returns n of chars to skip and adjusted reversal start state + /// returns num of chars to skip and adjusted reversal start state private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node) { int pos = 0; @@ -194,12 +194,14 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node pos = 0; return (false, node); }); + var addSingleton = new Func, (bool, SymbolicRegexNode)>(concatNode => { pos += 1; // continue with next concat return (true, concatNode._right!); }); + var addFixedLengthLoop = new Func, (bool, SymbolicRegexNode)>(concatNode => { SymbolicRegexNode? 
loopNode = concatNode._left; @@ -207,6 +209,7 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node { return (false, concatNode); } + switch (loopNode!._left!.Kind) { case SymbolicRegexNodeKind.Singleton: @@ -217,9 +220,11 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node // the entire loop is fixed, continue return (true, concatNode._right!); } + // subtract the fixed part of the loop int loopRemainder = loopNode._upper - loopNode._lower; - SymbolicRegexNode newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); + SymbolicRegexNode newLeft = + _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); pos += loopNode._lower; return (true, newNode); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 2f06c8e4af85c..89f2ae1fe8336 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1,11 +1,11 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Collections.Generic; using System.Diagnostics; using System.IO; using System.Numerics; -using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -82,7 +82,10 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Data and routines for skipping ahead to the next place a match could potentially start. private readonly RegexFindOptimizations? 
_findOpts; - /// Dead end state to quickly return NoMatch, this could potentially be a constant + /// + /// Dead end state to quickly return NoMatch. + /// This could potentially be a constant if it's the very first state created + /// private readonly int _deadStateId; /// Initial state used for vectorization @@ -91,7 +94,7 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Whether the pattern contains any anchor private readonly bool _containsAnyAnchor; - /// Whether the pattern contains the EndZ anchor which makes most optimizations invalid + /// Whether the pattern contains the EndZ anchor, which makes most optimization shortcuts invalid private readonly bool _containsEndZAnchor; /// The initial states for the original pattern, keyed off of the previous character kind. @@ -163,11 +166,11 @@ public static SymbolicRegexMatcher Create( // Convert the BDD-based AST to TSet-based AST SymbolicRegexNode rootNode = bddBuilder.Transform(rootBddNode, builder, (builder, bdd) => builder._solver.ConvertFromBDD(bdd, charSetSolver)); - return new SymbolicRegexMatcher(bddBuilder, builder, rootNode, captureCount, findOptimizations, matchTimeout); + return new SymbolicRegexMatcher(builder, rootNode, captureCount, findOptimizations, matchTimeout); } /// Constructs matcher for given symbolic regex. 
- private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) + private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNode rootNode, int captureCount, RegexFindOptimizations findOptimizations, TimeSpan matchTimeout) { Debug.Assert(builder._solver is UInt64Solver or BitVectorSolver, $"Unsupported solver: {builder._solver}"); @@ -215,13 +218,6 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder _, SymbolicRegexBuilder findOptimizations, // string literals are the best case { FindMode: @@ -230,11 +226,15 @@ FindNextStartingPositionMode.FixedDistanceString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight } => findOptimizations, // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null - { FindMode: FindNextStartingPositionMode.LeadingStrings_LeftToRight } => findOptimizations, - { FindMode: FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight } => findOptimizations, + { + FindMode: + FindNextStartingPositionMode.LeadingStrings_LeftToRight or + FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight, + LeadingStrings: not null + } when findOptimizations.LeadingStrings.GetType().Name != "StringSearchValuesAhoCorasick`2" => findOptimizations, // for singular character sets it depends if there's any reasonably small set to be accelerated - { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => null, - { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => null, + { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when !findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => findOptimizations, + { 
FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when !CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => findOptimizations, _ => null }; } @@ -291,31 +291,30 @@ FindNextStartingPositionMode.FixedDistanceString_LeftToRight or _reverseInitialStates = reverseInitialStates; - // TODO: this is still work in progress - // The frequency of occurrences makes a big difference here, - // anything above 4 uint16 chars is generally slower than DFA, but - // if the characters are very rare, then SearchValues can be up to ~2x faster - // SearchValues implementations to avoid: - // - ProbabilisticCharSearchValues - // - ProbabilisticWithAsciiCharSearchValues`1 - // - AsciiCharSearchValues`1 - // - Any5SearchValues`2" - // SearchValues implementations to avoid: - // - StringSearchValuesAhoCorasick`2 + // Some SearchValues implementations are slower than a DFA, + // but depend on input frequency. + // This is currently tuned for consistency + // but it could return false to enable findOptimizations. bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet) { + char[]? 
chars = fixedDistanceSet.Chars; + bool avoidSearchValues = false; + if (chars is not null && chars.Length > 5) + { + // RegexFindOptimizations picks 3 sets at most so the construction overhead should not be too high + var searchValues = SearchValues.Create(chars); + avoidSearchValues = searchValues.GetType().Name switch + { + "ProbabilisticCharSearchValues" => true, + "ProbabilisticWithAsciiCharSearchValues`1" => true, + "AsciiCharSearchValues`1" => true, + _ => false + }; + } + return fixedDistanceSet switch { - // anything above 4 uint16 chars is generally slower than DFA - { Chars: not null } => - // negated sets are usually large - fixedDistanceSet.Negated || - (fixedDistanceSet.Chars.Length > 4 - // TODO: this extra condition is currently kept so there's no regressions - // if ~500mb/s worst case is acceptable then this could be removed - // but being able to guess which character sets are not too frequent can - // often reach over 1gb/s with AVX - && Array.Exists(fixedDistanceSet.Chars, char.IsAsciiLetterLower)), + { Chars: not null } => fixedDistanceSet.Negated || avoidSearchValues, { Range: not null } => false, _ => false, }; @@ -429,9 +428,10 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // It returns NoMatchExists (-2) when there is no match. // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after - // the c as the low boundary for the starting position.d - int matchEnd; + // the c as the low boundary for the starting position. 
+ // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases + int matchEnd; if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) { matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) switch @@ -568,13 +568,19 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } } + /// + /// This version of uses a different set of interfaces, + /// which don't check for many inner loop edge cases e.g. input end or '\n'. + /// All edge cases are handled before entering the loop. + /// private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { - // TODO: possible this value could be removed + // this initial state candidate is not really used in the common DFA case + // and could potentially be removed in the future int initialStatePosCandidate = pos; var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index 5dcaa31225941..bf7d5a6501699 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -17,10 +17,11 @@ internal static class SymbolicRegexThresholds /// an NFA. As an NFA, we instead track all of the states we're in at any given point. 
/// /// - /// This limit is chosen due to memory usage constraints, the worst possible allocation is currently approx. 50 MB; + /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance + /// is currently approx. 50 MB. /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially - /// significant search-time performance gains. - /// Worst case memory consumption for the regex instance can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) + /// significant search-time performance gains. Worst case memory consumption for the regex instance + /// can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state. /// internal const int NfaNodeCountThreshold = 125_000; From 715752084280b47cbdfe6cabfb5079a8d100210e Mon Sep 17 00:00:00 2001 From: ieviev <36763595+ieviev@users.noreply.github.com> Date: Wed, 3 Jul 2024 23:49:15 +0300 Subject: [PATCH 43/63] Apply suggestions from code review Co-authored-by: Dan Moseley --- .../RegularExpressions/Symbolic/MintermClassifier.cs | 1 - .../Symbolic/SymbolicRegexMatcher.cs | 11 ++++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 323cc6cdcc316..9e3dad73e5bff 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -28,7 +28,6 @@ internal sealed class MintermClassifier /// private readonly int[]? 
_intLookup; - /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1 diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 89f2ae1fe8336..3ca0ddfd41e06 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -246,7 +246,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or // The loops below and how character kinds are calculated assume that the "general" character kind is zero Debug.Assert(CharKind.General == 0); - // Assign edge case info for quick lookup _containsAnyAnchor = _pattern._info.ContainsSomeAnchor; _containsEndZAnchor = _pattern._info.ContainsEndZAnchor; @@ -290,7 +289,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or } _reverseInitialStates = reverseInitialStates; - // Some SearchValues implementations are slower than a DFA, // but depend on input frequency. // This is currently tuned for consistency @@ -592,7 +590,7 @@ private int FindEndPositionOptimized dfaCharsPerTimeoutCheck ? pos + dfaCharsPerTimeoutCheck : lengthMinus1; @@ -606,7 +604,7 @@ private int FindEndPositionOptimized nfaCharsPerTimeoutCheck ? pos + nfaCharsPerTimeoutCheck : input.Length; @@ -678,7 +676,7 @@ private int FindEndPositionFallback dfaCharsPerTimeoutCheck ? pos + dfaCharsPerTimeoutCheck : input.Length; @@ -691,7 +689,7 @@ private int FindEndPositionFallback nfaCharsPerTimeoutCheck ? pos + nfaCharsPerTimeoutCheck : input.Length; @@ -840,7 +838,6 @@ private int FindEndPositionFallback /// Workhorse inner loop for . 
Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, From 931552d4de8dd787a8687fad71c89dc641b47984 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 3 Jul 2024 23:52:09 +0300 Subject: [PATCH 44/63] variable naming --- .../Symbolic/SymbolicRegexMatcher.cs | 16 ++++++++-------- .../tests/FunctionalTests/Regex.Match.Tests.cs | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 3ca0ddfd41e06..8e67be2ce7cf7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -591,8 +591,8 @@ private int FindEndPositionOptimized dfaCharsPerTimeoutCheck - ? pos + dfaCharsPerTimeoutCheck + innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck + ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; done = FindEndPositionDeltasDFAOptimized nfaCharsPerTimeoutCheck - ? pos + nfaCharsPerTimeoutCheck + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck + ? pos + NfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasNFA dfaCharsPerTimeoutCheck - ? pos + dfaCharsPerTimeoutCheck + innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck + ? pos + DfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasDFA nfaCharsPerTimeoutCheck - ? pos + nfaCharsPerTimeoutCheck + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck + ? 
pos + NfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasNFA(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1)) .Match(new string('a', CharsToTriggerTimeoutCheck))); From cc493f13679e21f3662b33ce5018d7e29e644408 Mon Sep 17 00:00:00 2001 From: ieviev Date: Wed, 3 Jul 2024 23:54:26 +0300 Subject: [PATCH 45/63] test for over 255 minterms --- .../FunctionalTests/Regex.Match.Tests.cs | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index af5adb063bfb7..b96d5459f6e8b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2653,5 +2653,28 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; } } + + [Fact] + public async Task MatchNonBacktrackingOver255Minterms() + { + // This is a test for the rare over 255 unique minterms case in MintermClassifier + StringBuilder pattern = new(); + StringBuilder input = new(); + for (int i = 256; i <= 768; i++) + { + string str = new Rune(i).ToString(); + pattern.Append(str); + // adding an optional char as well just so it's not a string literal + pattern.Append(str); + pattern.Append('?'); + // input is the pattern itself + input.Append(str); + } + Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, pattern.ToString(), RegexOptions.None); + MatchCollection ms = r.Matches(input.ToString()); + Assert.Equal(1, ms.Count); + Assert.Equal(0, ms[0].Index); + Assert.Equal(513, ms[0].Length); + } } } From a0d239064b276123b4e3726e12eddf34dcf007d9 
Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 00:22:50 +0300 Subject: [PATCH 46/63] adding net directive around test --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index b96d5459f6e8b..51b3d926e66fd 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2653,7 +2653,7 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; } } - +#if NET [Fact] public async Task MatchNonBacktrackingOver255Minterms() { @@ -2676,5 +2676,6 @@ public async Task MatchNonBacktrackingOver255Minterms() Assert.Equal(0, ms[0].Index); Assert.Equal(513, ms[0].Length); } +#endif } } From 0691c5894326a2e25b757eb06640c8c9eece3736 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 00:34:17 +0300 Subject: [PATCH 47/63] all engines in minterms test --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 51b3d926e66fd..993879441bb28 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2670,11 +2670,19 @@ public async Task MatchNonBacktrackingOver255Minterms() // input is the pattern itself 
input.Append(str); } - Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, pattern.ToString(), RegexOptions.None); - MatchCollection ms = r.Matches(input.ToString()); - Assert.Equal(1, ms.Count); - Assert.Equal(0, ms[0].Index); - Assert.Equal(513, ms[0].Length); + + // just so it's not allocated multiple times + string patternString = pattern.ToString(); + string inputString = input.ToString(); + + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); + MatchCollection ms = r.Matches(inputString); + Assert.Equal(1, ms.Count); + Assert.Equal(0, ms[0].Index); + Assert.Equal(513, ms[0].Length); + } } #endif } From 8ceb20767ee48052122db04dfce3c1ca1e645476 Mon Sep 17 00:00:00 2001 From: ieviev <36763595+ieviev@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:50:59 +0300 Subject: [PATCH 48/63] Apply suggestions from code review Co-authored-by: Stephen Toub --- .../Symbolic/SymbolicRegexMatcher.cs | 91 +++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 8e67be2ce7cf7..4f1f13ef9fc60 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -510,12 +510,13 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { i -= _optimizedReversalState.FixedLength; reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!); + // reversal may already be nullable here in the case of anchors if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0) { if 
(FullNullabilityHandler.IsNullableAt(this, - in reversalStartState, FullInputReader.GetPositionId(this, input, i), - DfaStateHandler.GetStateFlags(this, in reversalStartState))) + in reversalStartState, FullInputReader.GetPositionId(this, input, i), + DfaStateHandler.GetStateFlags(this, in reversalStartState))) { initialLastStart = i; } @@ -523,8 +524,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - reversalStartState = new CurrentState(_reverseInitialStates[ - GetCharKind(input, matchEnd)]); + reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); + } matchStart = matchEnd < startat ? startat @@ -567,9 +568,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } /// - /// This version of uses a different set of interfaces, - /// which don't check for many inner loop edge cases e.g. input end or '\n'. - /// All edge cases are handled before entering the loop. + /// Streamlined version of that doesn't handle /z anchors or very large sets of minterms. /// private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) @@ -590,7 +589,7 @@ private int FindEndPositionOptimized DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; @@ -603,7 +602,6 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? 
pos + NfaCharsPerTimeoutCheck @@ -772,10 +770,8 @@ private int FindEndPositionFallback( - this, mtlookup, input, ref currStateId, ref pos, initialStateId)) + this, mtlookup, input, ref currStateId, ref pos, initialStateId)) { - // a good potential future optimization here would - // be to combine this with an immediate state transition if (pos == input.Length) { // patterns such as ^$ can be nullable right away @@ -784,16 +780,18 @@ private int FindEndPositionFallback(this, _nullabilityArray, currStateId, mtlookup, - maxChar, input, pos)) + if (TOptimizedNullabilityHandler.IsNullable( + this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos)) { endPos = pos; + // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) { @@ -804,27 +802,28 @@ private int FindEndPositionFallback= lengthMinus1) + this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) || + pos >= lengthMinus1) { if (pos + 1 < input.Length) { return false; } pos++; + // one off check for the final position // this is just to move it out of the hot loop if (!(_stateFlagsArray[currStateId].IsNullable() || - _stateArray[currStateId]!.IsNullableFor( - GetPositionKind(-1)))) + _stateArray[currStateId]!.IsNullableFor(GetPositionKind(-1)))) { return true; + } // the end position (-1) was nullable endPos = pos; return true; } + // We successfully transitioned, so update our current input index to match. 
pos++; } @@ -856,8 +855,7 @@ private int FindEndPositionFallback private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef - ) + long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -879,6 +877,7 @@ private bool FindEndPositionDeltasDFA(this, input, ref state, ref pos)) @@ -893,9 +892,10 @@ private bool FindEndPositionDeltasDFA(this, in state, - positionId, TStateHandler.GetStateFlags(this, in state))) + positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; + // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; @@ -907,8 +907,7 @@ private bool FindEndPositionDeltasDFA= length || !TStateHandler.TryTakeTransition(this, ref state, - positionId, timeoutOccursAt)) + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) { return false; } @@ -922,7 +921,6 @@ private bool FindEndPositionDeltasDFA 0 && TNullabilityHandler.IsNullableAt(this, in state, positionId, - TStateHandler.GetStateFlags(this, in state))) + if (_nullabilityArray[state.DfaStateId] > 0 && + TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } @@ -1128,8 +1127,7 @@ private bool FindStartPositionDeltasNFA(this, in state, positionId, - TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } @@ -1473,7 +1471,8 @@ public static bool 
TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, int mintermId, long timeoutOccursAt) { - Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}."); + Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0"); + // Use the mintermId for the character being read to look up which state to transition to. // If that state has already been materialized, move to it, and we're done. If that state // hasn't been materialized, try to create it; if we can, move to it, and we're done. @@ -1487,14 +1486,15 @@ public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref } if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, - matcher.DeltaOffset(state, mintermId), - checkThreshold: true, out MatchingState? nextState, timeoutOccursAt)) + matcher.DeltaOffset(state, mintermId), + checkThreshold: true, out MatchingState? nextState, timeoutOccursAt)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. state = nextState.Id; return true; } + return false; } @@ -1703,8 +1703,7 @@ public static void UndoTransition(ref CurrentState state) /// private interface IOptimizedInputReader { - public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, - int pos); + public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos); } /// @@ -1717,7 +1716,8 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i { Debug.Assert(pos < input.Length, "pos < input.Length"); Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}"); - return input[pos] > maxChar ? 0 : lookup[input[pos]]; + char c = input[pos]; + return (uint)c < (uint)lookup.Length ? 
lookup[c] : 0; } } @@ -1742,8 +1742,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i private interface IOptimizedNullabilityHandler { public static abstract bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int - currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) + byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) where TOptimizedInputReader : struct, IOptimizedInputReader; } @@ -1755,8 +1754,7 @@ public static bool IsNullable(SymbolicRegexMatcher where TOptimizedInputReader : struct, IOptimizedInputReader { Debug.Assert(pos < input.Length, "input end should not be handled here"); - Debug.Assert(currStateId < nullabilityArray.Length, - "nullabilityArray grown but the reference is not up to date"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); return nullabilityArray[currStateId] > 0; } } @@ -1770,8 +1768,9 @@ public static bool IsNullable(SymbolicRegexMatcher { Debug.Assert(pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); - return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, - maxChar, input, pos)); + return + nullabilityArray[currStateId] > 0 && + matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, maxChar, input, pos)); } } @@ -1827,8 +1826,8 @@ public static abstract bool TryFindNextStartingPosition(SymbolicRe private interface IAcceleratedStateHandler { public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref - int currentStateId, ref int pos, int initialStateId) + SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, + ref int currentStateId, ref int 
pos, int initialStateId) where TOptimizedInputReader : struct, IOptimizedInputReader; } @@ -1836,13 +1835,13 @@ public static abstract bool TryFindNextStartingPosition( { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - byte[] lookup, - ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) where TOptimizedInputReader : struct, IOptimizedInputReader - { if (currentStateId != initialStateId) + { return false; + } if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { From 379519b032a327c2bf39fc601abf00de365062a5 Mon Sep 17 00:00:00 2001 From: ieviev <36763595+ieviev@users.noreply.github.com> Date: Thu, 4 Jul 2024 02:02:07 +0300 Subject: [PATCH 49/63] Apply suggestions from code review Co-authored-by: Stephen Toub --- .../Symbolic/MintermClassifier.cs | 2 +- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 43 +++++++++---------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 9e3dad73e5bff..b61a2f2cc96fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -137,6 +137,6 @@ public int GetMintermID(int c) /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int MaxChar() => _maxChar; + public int MaxChar() => (_lookup?.Length ?? 
_intLookup!.Length) - 1; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 3f096b97db3b5..c97b5e15b6403 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -188,6 +188,7 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node int pos = 0; SymbolicRegexNode? current = node; bool canLoop = true; + // finding anchors inside pattern invalidates this optimization var bail = new Func, (bool, SymbolicRegexNode)>(concatNode => { @@ -216,18 +217,18 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node if (loopNode._lower == loopNode._upper) { - pos += loopNode._lower; // the entire loop is fixed, continue + pos += loopNode._lower; return (true, concatNode._right!); } // subtract the fixed part of the loop int loopRemainder = loopNode._upper - loopNode._lower; - SymbolicRegexNode newLeft = - _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); + SymbolicRegexNode newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); pos += loopNode._lower; return (true, newNode); + default: return (false, concatNode); } @@ -237,36 +238,32 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node { (bool loop, SymbolicRegexNode next) = current switch { - // This could potentially be a very good future optimization for + // Bail if it contains any anchors. (This could potentially be a very good future optimization for // anchors but there's too many edge cases to guarantee it works. 
- // one example which fails currently: pattern: @"\By\b", input: "xy" + // one example which fails currently: pattern: @"\By\b", input: "xy") { _info.ContainsSomeAnchor: true } => bail(current), + // if this is reached then entire match is fixed length { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), - {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd} => - (true, current._right!), - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => - (true, current._right!), - {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => - addSingleton(current), - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => - addFixedLengthLoop(current), + + { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!), + + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), + + {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => addSingleton(current), + + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current), + _ => (false, current) }; canLoop = loop; current = next; } - MatchReversal reversal = - (pos, current) switch - { - { pos: > 0 } when current == _builder.Epsilon => new MatchReversal(MatchReversalKind.FixedLength, pos), - { pos: > 0 } => new MatchReversal(MatchReversalKind.PartialFixedLength, pos, - GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)), - _ => new MatchReversal(MatchReversalKind.MatchStart, 0) - }; - - return reversal; + return + pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : + current == _builder.Epsilon ? 
new MatchReversal(MatchReversalKind.FixedLength, pos) : + new MatchReversal(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); } /// From 57c8f6d41385de279752413743d3824da79a155b Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 02:37:18 +0300 Subject: [PATCH 50/63] simplifying code --- .../Symbolic/MintermClassifier.cs | 15 -- .../Symbolic/SymbolicRegexMatcher.cs | 154 ++++-------------- .../tests/UnitTests/SymbolicRegexTests.cs | 14 -- 3 files changed, 31 insertions(+), 152 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index b61a2f2cc96fb..41a6c9b007593 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -54,13 +54,6 @@ public MintermClassifier(BDD[] minterms) { _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); } - // there is an opportunity to gain around 5% performance for allocating the - // full 64K, past a certain threshold where maxChar is already large. - // TODO: what should this threshold be? - if (_maxChar > 32_000) - { - _maxChar = ushort.MaxValue; - } // It's incredibly rare for a regex to use more than a hundred or two minterms, // but we need a fallback just in case. @@ -125,14 +118,6 @@ public int GetMintermID(int c) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int[]? IntLookup() => _intLookup; - /// - /// Whether the full 64K char lookup is allocated. 
- /// This accelerates the minterm mapping by removing an if-else case, - /// and is only considered for the common <= 255 minterms case - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool IsFullLookup() => _lookup is not null && _lookup.Length == ushort.MaxValue + 1; - /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 4f1f13ef9fc60..d7582f2e27c2f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -110,7 +110,7 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher private readonly MatchingState[] _reverseInitialStates; /// - /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state. + /// Reversal state which skips fixed length parts. /// private readonly MatchReversal _optimizedReversalState; @@ -215,28 +215,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo if (findOptimizations.IsUseful && findOptimizations.LeadingAnchor is not RegexNodeKind.Beginning) { - // In some cases where the findOptimizations are useful, just using the DFA can still be faster. 
- _findOpts = findOptimizations switch - { - // string literals are the best case - { - FindMode: - FindNextStartingPositionMode.LeadingString_LeftToRight or - FindNextStartingPositionMode.FixedDistanceString_LeftToRight or - FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight - } => findOptimizations, - // note: only the Teddy implementation is faster than DFA here, Aho Corasick should map to null - { - FindMode: - FindNextStartingPositionMode.LeadingStrings_LeftToRight or - FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight, - LeadingStrings: not null - } when findOptimizations.LeadingStrings.GetType().Name != "StringSearchValuesAhoCorasick`2" => findOptimizations, - // for singular character sets it depends if there's any reasonably small set to be accelerated - { FindMode: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight } when !findOptimizations.FixedDistanceSets!.TrueForAll(CharSetIsTooCommon) => findOptimizations, - { FindMode: FindNextStartingPositionMode.LeadingSet_LeftToRight } when !CharSetIsTooCommon(findOptimizations.FixedDistanceSets![0]) => findOptimizations, - _ => null - }; + _findOpts = findOptimizations; } // Determine the number of initial states. If there's no anchor, only the default previous @@ -289,35 +268,6 @@ FindNextStartingPositionMode.LeadingStrings_LeftToRight or } _reverseInitialStates = reverseInitialStates; - // Some SearchValues implementations are slower than a DFA, - // but depend on input frequency. - // This is currently tuned for consistency - // but it could return false to enable findOptimizations. - bool CharSetIsTooCommon(RegexFindOptimizations.FixedDistanceSet fixedDistanceSet) - { - char[]? 
chars = fixedDistanceSet.Chars; - bool avoidSearchValues = false; - if (chars is not null && chars.Length > 5) - { - // RegexFindOptimizations picks 3 sets at most so the construction overhead should not be too high - var searchValues = SearchValues.Create(chars); - avoidSearchValues = searchValues.GetType().Name switch - { - "ProbabilisticCharSearchValues" => true, - "ProbabilisticWithAsciiCharSearchValues`1" => true, - "AsciiCharSearchValues`1" => true, - _ => false - }; - } - - return fixedDistanceSet switch - { - { Chars: not null } => fixedDistanceSet.Negated || avoidSearchValues, - { Range: not null } => false, - _ => false, - }; - } - // Maps a minterm ID to a character kind uint CalculateMintermIdKind(int mintermId) { @@ -432,32 +382,20 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i int matchEnd; if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) { - matchEnd = (_mintermClassifier.IsFullLookup(), _findOpts is not null, _containsAnyAnchor) switch + matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (false, true, true) => + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true, false) => + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, false) => + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false, true) => + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, true, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, true, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, 
perThreadData), }; } else @@ -572,7 +510,6 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) - where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { @@ -594,7 +531,7 @@ private int FindEndPositionOptimized(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState.DfaStateId, ref endPos); @@ -731,10 +668,9 @@ private int FindEndPositionFallback - private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) - where TOptimizedInputReader : struct, IOptimizedInputReader where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { @@ -769,7 +705,7 @@ private int FindEndPositionFallback( + if (TAcceleratedStateHandler.TryFindNextStartingPosition( this, mtlookup, input, ref currStateId, ref pos, initialStateId)) { if (pos == input.Length) @@ -787,7 +723,7 @@ private int FindEndPositionFallback( + if (TOptimizedNullabilityHandler.IsNullable( this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos)) { endPos = pos; @@ -802,7 +738,7 @@ private int FindEndPositionFallback= lengthMinus1) { if (pos + 1 < input.Length) @@ -1694,22 +1630,12 @@ public static void UndoTransition(ref CurrentState state) #endif } - /// - /// This input reader attempts to minimize overhead - /// by handling constraints outside of the loop: - /// 1. the position must be already valid for the input. - /// 2. the pattern must not contain \Z. - /// 3. 
to save memory, `maxChar` is a local variable set to the ordinal char for highest non-0 minterm - /// - private interface IOptimizedInputReader - { - public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos); - } - /// - /// This reader maps all characters > maxChar to 0 - /// - private readonly struct OptimizedSmallInputReader : IOptimizedInputReader + + // /// + // /// This reader maps all characters > maxChar to 0 + // /// + private readonly struct OptimizedSmallInputReader { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) @@ -1721,37 +1647,22 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i } } - /// - /// This reader is effectively an array lookup for the all utf16 code units - /// - private readonly struct OptimizedFullInputReader : IOptimizedInputReader - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - { - Debug.Assert(pos < input.Length); - Debug.Assert(lookup.Length == (ushort.MaxValue + 1)); - return lookup[input[pos]]; - } - } - /// /// This nullability handler interface can be used in DFAs /// for patterns that do not contain \Z /// private interface IOptimizedNullabilityHandler { - public static abstract bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - where TOptimizedInputReader : struct, IOptimizedInputReader; + public static abstract bool IsNullable(SymbolicRegexMatcher matcher, + byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, + int pos); } private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] 
nullabilityArray, int currStateId, byte[] lookup, + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - where TOptimizedInputReader : struct, IOptimizedInputReader { Debug.Assert(pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); @@ -1762,15 +1673,15 @@ public static bool IsNullable(SymbolicRegexMatcher private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - where TOptimizedInputReader : struct, IOptimizedInputReader { Debug.Assert(pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); return nullabilityArray[currStateId] > 0 && - matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, maxChar, input, pos)); + matcher.IsNullableWithContext(currStateId, + input[pos] < (uint)lookup.Length ? 
lookup[input[pos]] : 0); } } @@ -1825,18 +1736,16 @@ public static abstract bool TryFindNextStartingPosition(SymbolicRe /// private interface IAcceleratedStateHandler { - public static abstract bool TryFindNextStartingPosition( + public static abstract bool TryFindNextStartingPosition( SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, - ref int currentStateId, ref int pos, int initialStateId) - where TOptimizedInputReader : struct, IOptimizedInputReader; + ref int currentStateId, ref int pos, int initialStateId); } private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) - where TOptimizedInputReader : struct, IOptimizedInputReader { if (currentStateId != initialStateId) { @@ -1857,10 +1766,9 @@ public static bool TryFindNextStartingPosition(SymbolicRe private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) - where TOptimizedInputReader : struct, IOptimizedInputReader { if (currentStateId != initialStateId) return false; @@ -1868,7 +1776,8 @@ public static bool TryFindNextStartingPosition(SymbolicRe if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { currentStateId = matcher._dotstarredInitialStates[ - matcher._positionKinds[TOptimizedInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1] + matcher._positionKinds[ + 
OptimizedSmallInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1] ].Id; return true; } @@ -1883,10 +1792,9 @@ public static bool TryFindNextStartingPosition(SymbolicRe private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) - where TOptimizedInputReader : struct, IOptimizedInputReader { return false; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index 7192b70cec451..c14e5e366e53b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -253,19 +253,5 @@ public void SafeThresholdConfigTest(object? newThresholdData, int expectedThresh AppContext.SetData(SymbolicRegexThresholds.SymbolicRegexSafeSizeThreshold_ConfigKeyName, null); Assert.Equal(expectedThreshold, k); } - - [Fact] - public static void OptimizedReversalTests() - { - var charSetSolver = new CharSetSolver(); - var bddBuilder = new SymbolicRegexBuilder(charSetSolver, charSetSolver); - var converter = new RegexNodeConverter(bddBuilder, null); - const RegexOptions options = RegexOptions.NonBacktracking | RegexOptions.ExplicitCapture; - RegexNode tree = RegexParser.Parse("abc.*def", options, CultureInfo.CurrentCulture).Root; - SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(tree); - // todo: import the matcher here or use something else? 
- // var matcher = SymbolicRegexMatcher.Create(bddBuilder, rootNode, 0, null, TimeSpan.MaxValue); - - } } } From 2e57d428f0aa699805cb88290f0d44fb647c8da8 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 02:39:11 +0300 Subject: [PATCH 51/63] state flag values down --- .../System/Text/RegularExpressions/Symbolic/StateFlags.cs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index a342aff09b6b8..b446fecdca28f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -1,8 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-using System.Runtime.CompilerServices; - namespace System.Text.RegularExpressions.Symbolic { /// @@ -17,9 +15,9 @@ internal enum StateFlags : byte { None = 0, IsInitialFlag = 1, - IsNullableFlag = 4, - CanBeNullableFlag = 8, - SimulatesBacktrackingFlag = 16, + IsNullableFlag = 2, + CanBeNullableFlag = 4, + SimulatesBacktrackingFlag = 8, } /// From 60b1352f204f1726874b8f2558c2fe54272a0bf3 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 02:45:50 +0300 Subject: [PATCH 52/63] mintermclassifier changes --- .../Symbolic/MintermClassifier.cs | 22 ++++++++----------- .../Symbolic/SymbolicRegexMatcher.cs | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 41a6c9b007593..7a1af1fb5496b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -28,27 +28,20 @@ internal sealed class MintermClassifier /// private readonly int[]? _intLookup; - /// - /// Maximum ordinal character for a non-0 minterm, used to conserve memory - /// Note: this is maximum index allowed for the lookup, the array size is _maxChar + 1 - /// - private readonly int _maxChar; - /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. public MintermClassifier(BDD[] minterms) { Debug.Assert(minterms.Length > 0, "Requires at least"); - if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). 
_lookup = Array.Empty(); - _maxChar = -1; return; } + int _maxChar = -1; // attempt to save memory in common cases by allocating only up to the highest char code for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { @@ -96,13 +89,16 @@ public MintermClassifier(BDD[] minterms) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - if (c > _maxChar) + if (_intLookup is null) { - return 0; + byte[] lookup = _lookup!; + return (uint)c < (uint)lookup.Length ? lookup[c] : 0; + } + else + { + int[] lookup = _intLookup!; + return (uint)c < (uint)lookup.Length ? lookup[c] : 0; } - - // high performance inner-loop variant uses the array directly - return _intLookup is null ? _lookup![c] : _intLookup[c]; } /// /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index d7582f2e27c2f..3a7b331125002 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1404,7 +1404,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur /// Transition function that only considers DFA state id [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, + internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, int mintermId, long timeoutOccursAt) { Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0"); From 2900aadc6478ddb792d4e0b397fc8cabd69c1877 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 03:49:48 +0300 Subject: [PATCH 
53/63] reversal --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index c97b5e15b6403..524073c11959b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -186,24 +186,57 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node) { int pos = 0; - SymbolicRegexNode? current = node; + SymbolicRegexNode current = node; bool canLoop = true; + while (canLoop) + { + (bool loop, SymbolicRegexNode next) = current switch + { + // Bail if it contains any anchors. (This could potentially be a very good future optimization for + // anchors but there's too many edge cases to guarantee it works. 
+ // one example which fails currently: pattern: @"\By\b", input: "xy") + { _info.ContainsSomeAnchor: true } => Bail(), + + // if this is reached then entire match is fixed length + { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), + + { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!), + + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), + + {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => AddSingleton(current), + + {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => + AddFixedLengthLoop(current), + + _ => (false, current) + }; + canLoop = loop; + current = next; + } + + return + pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : + current == _builder.Epsilon ? new MatchReversal(MatchReversalKind.FixedLength, pos) : + new MatchReversal(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); + // finding anchors inside pattern invalidates this optimization - var bail = new Func, (bool, SymbolicRegexNode)>(concatNode => + (bool, SymbolicRegexNode) Bail() { - pos = 0; + pos += 1; + // continue with next concat return (false, node); - }); + } - var addSingleton = new Func, (bool, SymbolicRegexNode)>(concatNode => + (bool, SymbolicRegexNode) AddSingleton(SymbolicRegexNode concatNode) { pos += 1; // continue with next concat return (true, concatNode._right!); - }); + } - var addFixedLengthLoop = new Func, (bool, SymbolicRegexNode)>(concatNode => + (bool, SymbolicRegexNode) AddFixedLengthLoop(SymbolicRegexNode concatNode) { SymbolicRegexNode? 
loopNode = concatNode._left; if (loopNode is { _lower: <= 0 }) @@ -217,53 +250,22 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node if (loopNode._lower == loopNode._upper) { - // the entire loop is fixed, continue pos += loopNode._lower; + // the entire loop is fixed, continue return (true, concatNode._right!); } // subtract the fixed part of the loop int loopRemainder = loopNode._upper - loopNode._lower; - SymbolicRegexNode newLeft = _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); + SymbolicRegexNode newLeft = + _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); pos += loopNode._lower; return (true, newNode); - default: return (false, concatNode); } - }); - - while (canLoop) - { - (bool loop, SymbolicRegexNode next) = current switch - { - // Bail if it contains any anchors. (This could potentially be a very good future optimization for - // anchors but there's too many edge cases to guarantee it works. - // one example which fails currently: pattern: @"\By\b", input: "xy") - { _info.ContainsSomeAnchor: true } => bail(current), - - // if this is reached then entire match is fixed length - { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), - - { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), - - {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => addSingleton(current), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => addFixedLengthLoop(current), - - _ => (false, current) - }; - canLoop = loop; - current = next; } - - return - pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : - current == _builder.Epsilon ? 
new MatchReversal(MatchReversalKind.FixedLength, pos) : - new MatchReversal(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); } /// From 764ded8d4c35ba0b6d2beeee17a62bcd13e4b621 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 03:55:14 +0300 Subject: [PATCH 54/63] getstateflags --- .../Symbolic/SymbolicRegexMatcher.cs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 3a7b331125002..1d547b69239d1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1595,18 +1595,16 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) { SparseIntMap stateSet = state.NfaState!.NfaStateSet; + // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then + // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if + // they are true for any state in the set; SimulatesBacktracking is true for all the states if + // it is true for any state (since it is a phase-wide property); and all other flags are masked out. + StateFlags flags = 0; + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values)) { - // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then - // masking out the irrelevant ones. 
This works because IsNullable and CanBeNullable should be true if - // they are true for any state in the set; SimulatesBacktracking is true for all the states if - // it is true for any state (since it is a phase-wide property); and all other flags are masked out. - StateFlags flags = 0; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values)) - { - flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; - } - return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); + flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; } + return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); } #if DEBUG From 81d0dca2be560ee310f9fbc4e24228cbdd86a7be Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 03:56:17 +0300 Subject: [PATCH 55/63] formatting --- .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 1d547b69239d1..27646a170bb5b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -466,7 +466,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } matchStart = matchEnd < startat - ? startat + ? 
startat : (_containsEndZAnchor, _containsAnyAnchor) switch { (true, true) => From 38f28b9ada0504ac8e6504ce1876496b19847011 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 04:33:23 +0300 Subject: [PATCH 56/63] removing unused interface --- .../Symbolic/SymbolicRegexMatcher.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 27646a170bb5b..b80314c742840 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -385,16 +385,16 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { (true, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), (true, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), (false, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), (false, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } @@ -508,7 +508,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// Streamlined version of that doesn't handle /z anchors or very large sets of minterms. 
/// - private int FindEndPositionOptimized( + private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler @@ -1641,7 +1641,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan i Debug.Assert(pos < input.Length, "pos < input.Length"); Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}"); char c = input[pos]; - return (uint)c < (uint)lookup.Length ? lookup[c] : 0; + return c < (uint)lookup.Length ? lookup[c] : 0; } } From cce11887f7ba39685851752ae78bf17ab35fc950 Mon Sep 17 00:00:00 2001 From: ieviev Date: Thu, 4 Jul 2024 16:27:42 +0300 Subject: [PATCH 57/63] local function typo --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 524073c11959b..306704994c3de 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -216,6 +216,7 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node current = next; } + return pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : current == _builder.Epsilon ? 
new MatchReversal(MatchReversalKind.FixedLength, pos) : @@ -224,8 +225,8 @@ private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node // finding anchors inside pattern invalidates this optimization (bool, SymbolicRegexNode) Bail() { - pos += 1; - // continue with next concat + pos = 0; + // return original node return (false, node); } From 8b946da4b2e95e26604a358e369e91b5359c4025 Mon Sep 17 00:00:00 2001 From: ieviev Date: Fri, 5 Jul 2024 16:42:27 +0300 Subject: [PATCH 58/63] temporarily removing minterms test --- .../FunctionalTests/Regex.Match.Tests.cs | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 993879441bb28..e272942632aa7 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2654,36 +2654,36 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() } } #if NET - [Fact] - public async Task MatchNonBacktrackingOver255Minterms() - { - // This is a test for the rare over 255 unique minterms case in MintermClassifier - StringBuilder pattern = new(); - StringBuilder input = new(); - for (int i = 256; i <= 768; i++) - { - string str = new Rune(i).ToString(); - pattern.Append(str); - // adding an optional char as well just so it's not a string literal - pattern.Append(str); - pattern.Append('?'); - // input is the pattern itself - input.Append(str); - } - - // just so it's not allocated multiple times - string patternString = pattern.ToString(); - string inputString = input.ToString(); - - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - { - Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); - MatchCollection ms = 
r.Matches(inputString); - Assert.Equal(1, ms.Count); - Assert.Equal(0, ms[0].Index); - Assert.Equal(513, ms[0].Length); - } - } + // [Fact] + // public async Task MatchNonBacktrackingOver255Minterms() + // { + // // This is a test for the rare over 255 unique minterms case in MintermClassifier + // StringBuilder pattern = new(); + // StringBuilder input = new(); + // for (int i = 256; i <= 768; i++) + // { + // string str = new Rune(i).ToString(); + // pattern.Append(str); + // // adding an optional char as well just so it's not a string literal + // pattern.Append(str); + // pattern.Append('?'); + // // input is the pattern itself + // input.Append(str); + // } + // + // // just so it's not allocated multiple times + // string patternString = pattern.ToString(); + // string inputString = input.ToString(); + // + // foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + // { + // Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); + // MatchCollection ms = r.Matches(inputString); + // Assert.Equal(1, ms.Count); + // Assert.Equal(0, ms[0].Index); + // Assert.Equal(513, ms[0].Length); + // } + // } #endif } } From d3430b3d4be400e56d46e7e6ec3714a4c7dd797a Mon Sep 17 00:00:00 2001 From: ieviev Date: Sat, 6 Jul 2024 11:07:30 +0300 Subject: [PATCH 59/63] re-adding minterms test --- .../FunctionalTests/Regex.Match.Tests.cs | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index e272942632aa7..cdfa9e5d33113 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2654,36 +2654,35 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() } } #if NET - // [Fact] - // 
public async Task MatchNonBacktrackingOver255Minterms() - // { - // // This is a test for the rare over 255 unique minterms case in MintermClassifier - // StringBuilder pattern = new(); - // StringBuilder input = new(); - // for (int i = 256; i <= 768; i++) - // { - // string str = new Rune(i).ToString(); - // pattern.Append(str); - // // adding an optional char as well just so it's not a string literal - // pattern.Append(str); - // pattern.Append('?'); - // // input is the pattern itself - // input.Append(str); - // } - // - // // just so it's not allocated multiple times - // string patternString = pattern.ToString(); - // string inputString = input.ToString(); - // - // foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - // { - // Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); - // MatchCollection ms = r.Matches(inputString); - // Assert.Equal(1, ms.Count); - // Assert.Equal(0, ms[0].Index); - // Assert.Equal(513, ms[0].Length); - // } - // } + [Fact] + public async Task MatchNonBacktrackingOver255Minterms() + { + // This is a test for the rare over 255 unique minterms case in MintermClassifier + StringBuilder pattern = new(); + StringBuilder input = new(); + for (int i = 128; i <= 500; i++) + { + char c = (char)i; + pattern.Append(c); + // adding an optional char as well just so it's not a string literal + pattern.Append(c); + pattern.Append('?'); + // input is the pattern itself + input.Append(c); + } + + string patternString = pattern.ToString(); + string inputString = input.ToString(); + + // foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + // { + Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None); + MatchCollection ms = r.Matches(inputString); + Assert.Equal(1, ms.Count); + Assert.Equal(0, ms[0].Index); + Assert.Equal(373, ms[0].Length); + // } + } #endif } } From 388c256331a91fb4c787fc7c4213788b06c5db1d Mon Sep 17 00:00:00 2001 From: 
ieviev Date: Mon, 8 Jul 2024 22:34:05 +0300 Subject: [PATCH 60/63] reenabling test for all engines --- .../FunctionalTests/Regex.Match.Tests.cs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index cdfa9e5d33113..2231062b5af57 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2670,18 +2670,18 @@ public async Task MatchNonBacktrackingOver255Minterms() // input is the pattern itself input.Append(c); } - + string patternString = pattern.ToString(); string inputString = input.ToString(); - - // foreach (RegexEngine engine in RegexHelpers.AvailableEngines) - // { - Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None); - MatchCollection ms = r.Matches(inputString); - Assert.Equal(1, ms.Count); - Assert.Equal(0, ms[0].Index); - Assert.Equal(373, ms[0].Length); - // } + + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None); + MatchCollection ms = r.Matches(inputString); + Assert.Equal(1, ms.Count); + Assert.Equal(0, ms[0].Index); + Assert.Equal(373, ms[0].Length); + } } #endif } From 270464102417572fff83a049b07d2919e687aca1 Mon Sep 17 00:00:00 2001 From: ieviev Date: Mon, 8 Jul 2024 22:59:33 +0300 Subject: [PATCH 61/63] test bugfix --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 
2231062b5af57..9e6db7976433b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2660,7 +2660,7 @@ public async Task MatchNonBacktrackingOver255Minterms() // This is a test for the rare over 255 unique minterms case in MintermClassifier StringBuilder pattern = new(); StringBuilder input = new(); - for (int i = 128; i <= 500; i++) + for (int i = 128; i <= 400; i++) { char c = (char)i; pattern.Append(c); @@ -2676,7 +2676,7 @@ public async Task MatchNonBacktrackingOver255Minterms() foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - Regex r = await RegexHelpers.GetRegexAsync(RegexEngine.NonBacktracking, patternString, RegexOptions.None); + Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); MatchCollection ms = r.Matches(inputString); Assert.Equal(1, ms.Count); Assert.Equal(0, ms[0].Index); From 0abaabee87b9e9dcce6c55ae3901a0fa79ecc88d Mon Sep 17 00:00:00 2001 From: ieviev Date: Tue, 9 Jul 2024 00:00:04 +0300 Subject: [PATCH 62/63] expected matches change --- .../tests/FunctionalTests/Regex.Match.Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 9e6db7976433b..6ad2275f9584b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2680,7 +2680,7 @@ public async Task MatchNonBacktrackingOver255Minterms() MatchCollection ms = r.Matches(inputString); Assert.Equal(1, ms.Count); Assert.Equal(0, ms[0].Index); - Assert.Equal(373, ms[0].Length); + Assert.Equal(273, ms[0].Length); } } #endif From 0a0f40982f089ca835306f37efc8eb5520e38f6e Mon 
Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 10 Jul 2024 15:01:01 -0400 Subject: [PATCH 63/63] Review and clean up some code Simplification, style consistency, dead code deletion, some bounds-check removal, etc. --- .../RegularExpressions/RegexReplacement.cs | 1 - .../Symbolic/MatchReversal.cs | 42 +- .../Symbolic/MatchReversalKind.cs | 30 +- .../Symbolic/MatchingState.cs | 44 +- .../Symbolic/MintermClassifier.cs | 82 ++-- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 169 +++---- .../Symbolic/SymbolicRegexMatcher.cs | 443 ++++++++---------- .../Symbolic/SymbolicRegexNode.cs | 37 +- .../Symbolic/SymbolicRegexThresholds.cs | 11 +- .../FunctionalTests/Regex.Match.Tests.cs | 27 +- 10 files changed, 410 insertions(+), 476 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 49205f5ee2649..d2aec2621a81c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -5,7 +5,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; #pragma warning disable CS8500 // takes address of managed type diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs index cd00755dbe6dc..2ea1ea8af7422 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -1,15 +1,39 @@ // Licensed to the .NET Foundation under one or more 
agreements. // The .NET Foundation licenses this file to you under the MIT license. -namespace System.Text.RegularExpressions.Symbolic; +using System.Diagnostics; -internal readonly struct MatchReversal( - MatchReversalKind kind, - int fixedLength, - MatchingState? adjustedStartState = null) - where TSet : IComparable, IEquatable +namespace System.Text.RegularExpressions.Symbolic { - internal MatchReversalKind Kind { get; } = kind; - internal int FixedLength { get; } = fixedLength; - internal MatchingState? AdjustedStartState { get; } = adjustedStartState; + /// Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed. + internal readonly struct MatchReversalInfo where TSet : IComparable, IEquatable + { + /// Initializes the match reversal details. + internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null) + { + Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength); + Debug.Assert(fixedLength >= 0); + Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength)); + + Kind = kind; + FixedLength = fixedLength; + AdjustedStartState = adjustedStartState; + } + + /// Gets the kind of the match reversal processing required. + internal MatchReversalKind Kind { get; } + + /// Gets the fixed length of the match, if one is known. + /// + /// For , this is ignored. + /// For , this is the full length of the match. The beginning may be found simply + /// by subtracting this length from the end. + /// For , this is the length of fixed portion of the match. + /// + internal int FixedLength { get; } + + /// Gets the adjusted start state to use for partial fixed-length matches. + /// This will be non-null iff is . + internal MatchingState? 
AdjustedStartState { get; } + } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs index d498e4dd7eb99..a949e6204a16a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs @@ -1,14 +1,26 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -namespace System.Text.RegularExpressions.Symbolic; - -internal enum MatchReversalKind +namespace System.Text.RegularExpressions.Symbolic { - /// The most generic option, run the regex backwards to find beginning of match - MatchStart, - /// Part of the reversal is fixed length and can be skipped - PartialFixedLength, - /// The entire pattern is fixed length, reversal not necessary - FixedLength + /// Specifies the kind of a . + internal enum MatchReversalKind + { + /// The regex should be run in reverse to find beginning of the match. + MatchStart, + + /// The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match. + /// + /// Reverse execution is not necessary for a subset of the match. + /// will contain the length of the fixed portion. + /// + PartialFixedLength, + + /// The entire pattern is of a fixed length. + /// + /// Reverse execution is not necessary to find the beginning of the match. + /// will contain the length of the match. 
+ /// + FixedLength + } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 405be0318bbd5..3aacc4a61cbb9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) NullabilityInfo = BuildNullabilityInfo(); } - internal int NullabilityInfo { get; } - /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -98,15 +96,31 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); } - /// - /// Cached nullability check with encoded bits - /// + /// Determines whether the node is nullable for the given context. + /// + /// This is functionally equivalent to , but using cached + /// answers stored in . + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - return ((1 << (int)nextCharKind) & NullabilityInfo) != 0; + Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount); + return (NullabilityInfo & (1 << (int)nextCharKind)) != 0; } + /// Gets the nullability info for the matching state. + /// + /// + /// 00000 -> node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter + /// + /// + internal int NullabilityInfo { get; } + /// /// Builds a with the relevant flags set. 
/// @@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial) return info; } - /// - /// Builds the nullability information for the matching state. - /// Nullability for each context is encoded in a bit - /// 0 means node cannot be nullable - /// 00001 -> nullable for General - /// 00010 -> nullable for BeginningEnd - /// 00100 -> nullable for NewLine - /// 01000 -> nullable for NewLineS - /// 10000 -> nullable for WordLetter - /// - internal byte BuildNullabilityInfo() + /// Builds the nullability information for the matching state. + /// Nullability for each context is encoded in a bit. See . + private byte BuildNullabilityInfo() { byte nullabilityInfo = 0; if (Node.CanBeNullable) { - for (uint ck = 0; ck < CharKind.CharKindCount; ck++) + for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++) { - nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0); + nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 7a1af1fb5496b..24d2a26f84922 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; namespace System.Text.RegularExpressions.Symbolic @@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - /// An array used to map characters to minterms + /// Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms. + /// _lookup[char] provides the minterm ID. If char >= _lookup.Length, its minterm is 0. private readonly byte[]? _lookup; - /// - /// Fallback lookup if over 255 minterms. This is rarely used. - /// + /// Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used. + /// _intLookup[char] provides the minterm ID. If char >= _intLookup.Length, its minterm is 0. private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. @@ -37,51 +39,54 @@ public MintermClassifier(BDD[] minterms) if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). - _lookup = Array.Empty(); + _lookup = []; return; } - int _maxChar = -1; - // attempt to save memory in common cases by allocating only up to the highest char code + // Compute all minterm ranges. We do this here in order to determine the maximum character value + // in order to size the lookup array to minimize steady-state memory consumption of the potentially + // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory + // consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case. + // However, when there are more than 255 minterms, we need to use int[] _intLookup. 
+ (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length); + + int maxChar = -1; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); + (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]); + charRangesPerMinterm[mintermId] = ranges; + maxChar = Math.Max(maxChar, (int)ranges[^1].Item2); } - // It's incredibly rare for a regex to use more than a hundred or two minterms, - // but we need a fallback just in case. + // It's incredibly rare for a regex to use more than a couple hundred minterms, + // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.) if (minterms.Length > 255) { - // over 255 unique sets also means it's never ascii only - int[] lookup = new int[_maxChar + 1]; - for (int mintermId = 1; mintermId < minterms.Length; mintermId++) - { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) - { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill(mintermId); - } - } - _intLookup = lookup; + _intLookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); } else { - byte[] lookup = new byte[_maxChar + 1]; + _lookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); + } + + // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive. + Array.Clear(charRangesPerMinterm, 0, minterms.Length); + ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm); + + // Creates the lookup array. 
+ static T[] CreateLookup(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger + { + T[] lookup = new T[_maxChar + 1]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) + // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm. + foreach ((uint start, uint end) in charRangesPerMinterm[mintermId]) { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill((byte)mintermId); + lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId)); } } - _lookup = lookup; + + return lookup; } } @@ -89,9 +94,9 @@ public MintermClassifier(BDD[] minterms) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - if (_intLookup is null) + if (_lookup is not null) { - byte[] lookup = _lookup!; + byte[] lookup = _lookup; return (uint)c < (uint)lookup.Length ? lookup[c] : 0; } else @@ -104,20 +109,17 @@ public int GetMintermID(int c) /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. /// Null if there are greater than 255 minterms. /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public byte[]? ByteLookup() => _lookup; + public byte[]? ByteLookup => _lookup; /// /// Gets a mapping from char to minterm for the rare case when there are >= 255 minterms. /// Null in the common case where there are fewer than 255 minterms. /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int[]? IntLookup() => _intLookup; + public int[]? 
IntLookup => _intLookup; /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1; + public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 306704994c3de..327f5666f9e2a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -26,7 +26,7 @@ internal sealed partial class SymbolicRegexMatcher /// Cache for the states that have been created. Each state is uniquely identified by its associated /// and the kind of the previous character. /// - private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new(); + private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = []; /// /// Maps state ids to states, initial capacity is given by . @@ -41,20 +41,14 @@ internal sealed partial class SymbolicRegexMatcher /// private StateFlags[] _stateFlagsArray; - /// - /// important: the pattern must not contain endZ for this to be valid. - /// Used to short-circuit nullability in the hot loop - /// nullability for each context is encoded in a bit - /// 0 means node cannot be nullable - /// 00001 -> nullable for General - /// 00010 -> nullable for BeginningEnd - /// 00100 -> nullable for NewLine - /// 01000 -> nullable for NewLineS - /// 10000 -> nullable for WordLetter - /// + /// Cached nullability info for each state ID. 
+ /// + /// _nullabilityArray[stateId] == the for that state. + /// Used to short-circuit nullability in the hot loop. + /// Important: the pattern must not contain endZ for this to be valid. + /// private byte[] _nullabilityArray; - /// /// The transition function for DFA mode. /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is @@ -84,7 +78,7 @@ internal sealed partial class SymbolicRegexMatcher /// It is the inverse of used entries in _nfaStateArray. /// The range of this map is 0 to its size - 1. /// - private readonly Dictionary _nfaIdByCoreId = new(); + private readonly Dictionary _nfaIdByCoreId = []; /// /// Transition function for NFA transitions in NFA mode. @@ -127,7 +121,7 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private bool IsNullableWithContext(int stateId, int mintermId) => - ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0; + (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0; /// Returns the span from that may contain transitions for the given state private Span GetDeltasFor(MatchingState state) @@ -175,98 +169,75 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint } /// - /// Optimized reversal state computation during construction which - /// skips the fixed length parts of reversal - /// e.g. for the pattern abc.*def + /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process + /// for when finding the beginning of a match. + /// + /// + /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. 
for the pattern abc.*def /// 1) the end is found at abc.*def| /// 2) the reversal starts at abc.*| - /// - /// reversed initial pattern - /// returns num of chars to skip and adjusted reversal start state - private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node) + /// + /// Reversed initial pattern + /// The match reversal details. + private MatchReversalInfo CreateOptimizedReversal(SymbolicRegexNode node) { int pos = 0; - SymbolicRegexNode current = node; - bool canLoop = true; - - while (canLoop) + while (true) { - (bool loop, SymbolicRegexNode next) = current switch + if (node._info.ContainsSomeAnchor) { - // Bail if it contains any anchors. (This could potentially be a very good future optimization for - // anchors but there's too many edge cases to guarantee it works. - // one example which fails currently: pattern: @"\By\b", input: "xy") - { _info.ContainsSomeAnchor: true } => Bail(), - - // if this is reached then entire match is fixed length - { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), - - { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), - - {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => AddSingleton(current), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => - AddFixedLengthLoop(current), - - _ => (false, current) - }; - canLoop = loop; - current = next; - } - - - return - pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : - current == _builder.Epsilon ? new MatchReversal(MatchReversalKind.FixedLength, pos) : - new MatchReversal(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); + // Bail if it contains any anchors as it invalidates the optimization. 
+ // (This could potentially be a very good future optimization for anchors but there's too many edge cases to guarantee it works. + // One example which fails currently: pattern: @"\By\b", input: "xy") + pos = 0; + break; + } - // finding anchors inside pattern invalidates this optimization - (bool, SymbolicRegexNode) Bail() - { - pos = 0; - // return original node - return (false, node); - } + if (node._kind is not SymbolicRegexNodeKind.Concat) + { + if (node._kind is SymbolicRegexNodeKind.CaptureStart) + { + node = _builder.Epsilon; // The entire match is fixed length. + } + break; + } - (bool, SymbolicRegexNode) AddSingleton(SymbolicRegexNode concatNode) - { - pos += 1; - // continue with next concat - return (true, concatNode._right!); - } + SymbolicRegexNode? left = node._left; + Debug.Assert(left is not null); - (bool, SymbolicRegexNode) AddFixedLengthLoop(SymbolicRegexNode concatNode) - { - SymbolicRegexNode? loopNode = concatNode._left; - if (loopNode is { _lower: <= 0 }) + if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton) { - return (false, concatNode); + node = node._right!; + if (left._kind is SymbolicRegexNodeKind.Singleton) + { + pos++; + } } + else if (left._kind is SymbolicRegexNodeKind.Loop) + { + if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton) + { + break; + } - switch (loopNode!._left!.Kind) + node = left._lower == left._upper ? + node._right! : // The entire loop is fixed + _builder.CreateConcat( // Subtract the fixed part of the loop. 
+ _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower), + node._right!); + pos += left._lower; + } + else { - case SymbolicRegexNodeKind.Singleton: - - if (loopNode._lower == loopNode._upper) - { - pos += loopNode._lower; - // the entire loop is fixed, continue - return (true, concatNode._right!); - } - - // subtract the fixed part of the loop - int loopRemainder = loopNode._upper - loopNode._lower; - SymbolicRegexNode newLeft = - _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); - SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); - pos += loopNode._lower; - return (true, newNode); - default: - return (false, concatNode); + break; } } + + Debug.Assert(pos >= 0); + return + pos == 0 ? new MatchReversalInfo(MatchReversalKind.MatchStart, 0) : + node == _builder.Epsilon ? new MatchReversalInfo(MatchReversalKind.FixedLength, pos) : + new MatchReversalInfo(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0)); } /// @@ -299,7 +270,7 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node } _stateArray[state.Id] = state; _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); - _nullabilityArray[state.Id] = state.BuildNullabilityInfo(); + _nullabilityArray[state.Id] = (byte)state.NullabilityInfo; } return state; @@ -395,11 +366,8 @@ private bool TryCreateNewTransition( MatchingState? 
targetState = _stateArray[_dfaDelta[offset]]; if (targetState is null) { - if (// check if there is an active timer - (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || - // check if amount of nodes exceeds the NFA threshold - (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold) - ) + if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer + (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold { nextState = null; return false; @@ -438,7 +406,7 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse SymbolicRegexNode targetNode = coreTargetId > 0 ? GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); - List targetsList = new(); + List targetsList = []; ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) => targetsList.Add(nfaId)); @@ -465,8 +433,9 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse TSet minterm = GetMintermFromId(mintermId); uint nextCharKind = GetPositionKind(mintermId); List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind); + // Build the new state and store it into the array. 
- List<(int, DerivativeEffect[])> targetsList = new(); + List<(int, DerivativeEffect[])> targetsList = []; foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition) { ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index b80314c742840..08f423b03344a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Buffers; using System.Collections.Generic; using System.Diagnostics; using System.IO; @@ -84,17 +83,16 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// /// Dead end state to quickly return NoMatch. - /// This could potentially be a constant if it's the very first state created /// private readonly int _deadStateId; - /// Initial state used for vectorization + /// Initial state used for vectorization. private readonly int _initialStateId; - /// Whether the pattern contains any anchor + /// Whether the pattern contains any anchor. private readonly bool _containsAnyAnchor; - /// Whether the pattern contains the EndZ anchor, which makes most optimization shortcuts invalid + /// Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts. private readonly bool _containsEndZAnchor; /// The initial states for the original pattern, keyed off of the previous character kind. 
@@ -109,10 +107,8 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _reverseInitialStates; - /// - /// Reversal state which skips fixed length parts. - /// - private readonly MatchReversal _optimizedReversalState; + /// Details on optimized processing of the reverse of the pattern to find the beginning of a match. + private readonly MatchReversalInfo _optimizedReversalInfo; /// Partition of the input space of sets. private readonly TSet[] _minterms; @@ -190,8 +186,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; - - // Initialization for fields in SymbolicRegexMatcher.Automata.cs + // Initialize state and nullability arrays. _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; _nullabilityArray = new byte[InitialDfaStateCapacity]; @@ -206,8 +201,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); } - // Create optimized reversal - _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder)); + // Gather optimized reversal processing information. + _optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder)); // Store the find optimizations that can be used to jump ahead to the next possible starting location. 
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's @@ -251,9 +246,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo } _dotstarredInitialStates = dotstarredInitialStates; - // Assign dead state id + // Assign dead and initial state ids _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; - // Assign initial state id _initialStateId = _dotstarredInitialStates[CharKind.General].Id; // Create the reverse pattern (the original pattern in reverse order) and all of its @@ -378,38 +372,25 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. - // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases + // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases. int matchEnd; - if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) + if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null) { + // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. 
matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (true, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else { - // fallback for Z anchor or over 255 minterms - matchEnd = (_findOpts is not null) switch - { - true => - FindEndPositionFallback( - input, startat, timeoutOccursAt, mode, perThreadData), - false => - FindEndPositionFallback( - input, startat, timeoutOccursAt, mode, perThreadData), - }; + // Fallback for Z anchor or over 255 minterms + matchEnd = _findOpts is not null ? + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); } // If there wasn't a match, we're done. @@ -431,60 +412,57 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that // exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from // that last b until it finds the 4th a: aaabbbc. 
- int matchStart; + int matchStart = 0; Debug.Assert(matchEnd >= startat - 1); - switch (_optimizedReversalState.Kind) + switch (_optimizedReversalInfo.Kind) { - case MatchReversalKind.FixedLength: - matchStart = (matchEnd - _optimizedReversalState.FixedLength); - break; - case MatchReversalKind.MatchStart: case MatchReversalKind.PartialFixedLength: int initialLastStart = -1; // invalid sentinel value int i = matchEnd; CurrentState reversalStartState; - if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength) + + if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart) { - i -= _optimizedReversalState.FixedLength; - reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!); + // No fixed-length knowledge. Start at the end of the match. + reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); + } + else + { + // There's a fixed-length portion at the end of the match. Start just before it. + i -= _optimizedReversalInfo.FixedLength; + reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!); // reversal may already be nullable here in the case of anchors - if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0) - { - if (FullNullabilityHandler.IsNullableAt(this, - in reversalStartState, FullInputReader.GetPositionId(this, input, i), + if (_containsAnyAnchor && + _nullabilityArray[reversalStartState.DfaStateId] > 0 && + FullNullabilityHandler.IsNullableAt( + this, in reversalStartState, FullInputReader.GetPositionId(this, input, i), DfaStateHandler.GetStateFlags(this, in reversalStartState))) - { - initialLastStart = i; - } + { + initialLastStart = i; } } - else - { - reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); - } - matchStart = matchEnd < startat - ? startat - : (_containsEndZAnchor, _containsAnyAnchor) switch + matchStart = matchEnd < startat ? 
startat : (_containsEndZAnchor, _containsAnyAnchor) switch { - (true, true) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (true, false) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, true) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, false) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), + // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization; + // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized. + (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), }; break; + + case MatchReversalKind.FixedLength: + // The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there. + matchStart = matchEnd - _optimizedReversalInfo.FixedLength; + break; + default: - throw new ArgumentOutOfRangeException(); + Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}"); + break; } // Phase 3: @@ -513,8 +491,7 @@ private int FindEndPositionOptimized(input, pos - 1)]); int endPos = NoMatchExists; @@ -527,26 +504,19 @@ private int FindEndPositionOptimized DfaCharsPerTimeoutCheck - ? 
pos + DfaCharsPerTimeoutCheck - : lengthMinus1; - done = - FindEndPositionDeltasDFAOptimized< - TAcceleratedStateHandler, - TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, - ref currentState.DfaStateId, ref endPos); + innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; + done = FindEndPositionDeltasDFAOptimized( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState.DfaStateId, ref endPos); } else { - // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here - const int NfaCharsPerTimeoutCheck = 1000; - innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck - ? pos + NfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasNFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref initialStatePosCandidate, ref initialStatePosCandidate); + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -611,27 +581,18 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck - ? 
pos + DfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasDFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePosCandidate); + const int DfaCharsPerTimeoutCheck = 25_000; + innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasDFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } else { - // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here - // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms - const int NfaCharsPerTimeoutCheck = 1000; - innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck - ? pos + NfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasNFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePosCandidate); + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -665,18 +626,17 @@ private int FindEndPositionFallback /// This version of uses a different set of interfaces, - /// which don't check for many inner loop edge cases e.g. input end or '\n'. + /// which don't check for many inner loop edge cases, e.g. input end or '\n'. 
/// All edge cases are handled before entering the loop. /// - private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFAOptimized( + ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { - // initial check for input end to get it out of the loop + // Initial check for input end lifted out of the subsequent hot-path loop. if (posRef == input.Length) - { if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0])) { @@ -688,12 +648,12 @@ private int FindEndPositionFallback= lengthMinus1) { if (pos + 1 < input.Length) @@ -755,6 +712,7 @@ private int FindEndPositionFallback(this, input, ref state, ref pos)) { @@ -827,8 +787,7 @@ private bool FindEndPositionDeltasDFA(this, in state, - positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; @@ -1178,7 +1137,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, } Debug.Assert(current.Count > 0); - foreach (var (endStateId, endRegisters) in current.Values) + foreach ((int endStateId, Registers endRegisters) in current.Values) { MatchingState endState = GetState(GetCoreStateId(endStateId)); if (endState.IsNullableFor(GetCharKind(input, iEnd))) @@ -1194,6 +1153,16 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, return default; } + /// Look up the min term ID for the character at the specified position in the input. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos >= 0 && pos < input.Length); + + char c = input[pos]; + return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0; + } + /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. internal struct Registers(int[] captureStarts, int[] captureEnds) @@ -1442,8 +1411,8 @@ internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, re /// - whether this state may be contextually nullable /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) - => matcher._stateFlagsArray[state.DfaStateId]; + public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) => + matcher._stateFlagsArray[state.DfaStateId]; } /// An for operating over instances configured as NFA states. @@ -1594,16 +1563,16 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< [MethodImpl(MethodImplOptions.AggressiveInlining)] public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) { - SparseIntMap stateSet = state.NfaState!.NfaStateSet; // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if // they are true for any state in the set; SimulatesBacktracking is true for all the states if // it is true for any state (since it is a phase-wide property); and all other flags are masked out. 
StateFlags flags = 0; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values)) + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; } + return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); } @@ -1628,61 +1597,6 @@ public static void UndoTransition(ref CurrentState state) #endif } - - - // /// - // /// This reader maps all characters > maxChar to 0 - // /// - private readonly struct OptimizedSmallInputReader - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - { - Debug.Assert(pos < input.Length, "pos < input.Length"); - Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}"); - char c = input[pos]; - return c < (uint)lookup.Length ? lookup[c] : 0; - } - } - - /// - /// This nullability handler interface can be used in DFAs - /// for patterns that do not contain \Z - /// - private interface IOptimizedNullabilityHandler - { - public static abstract bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, - int pos); - } - - private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, - int maxChar, ReadOnlySpan input, int pos) - { - Debug.Assert(pos < input.Length, "input end should not be handled here"); - Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); - return nullabilityArray[currStateId] > 0; - } - } - - private readonly struct AnchorOptimizedNullabilityHandler : 
IOptimizedNullabilityHandler - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - { - Debug.Assert(pos < input.Length, "input end should not be handled here"); - Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); - return - nullabilityArray[currStateId] > 0 && - matcher.IsNullableWithContext(currStateId, - input[pos] < (uint)lookup.Length ? lookup[input[pos]] : 0); - } - } - /// /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to /// both take transitions and decide nullability. For positions of valid characters that are handled normally, @@ -1695,13 +1609,11 @@ private interface IInputReader public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); } - - /// This reader omits the special handling of \n for the \Z anchor. private readonly struct NoZAnchorInputReader : IInputReader { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]); + (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1; } /// This reader includes full handling of an \n as the last character of input for the \Z anchor. 
@@ -1709,23 +1621,24 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) { - if ((uint)pos >= (uint)input.Length) - return -1; - - int c = input[pos]; + if ((uint)pos < (uint)input.Length) + { + // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor + int c = input[pos]; + return c == '\n' && pos == input.Length - 1 ? + matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input + matcher._mintermClassifier.GetMintermID(c); + } - // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor - return c == '\n' && pos == input.Length - 1 ? - matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input - matcher._mintermClassifier.GetMintermID(c); + return -1; } } - private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - ReadOnlySpan input, ref CurrentState state, ref int pos) + public static abstract bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, + ref CurrentState state, ref int pos) where TInputReader : struct, IInputReader; } @@ -1735,54 +1648,55 @@ public static abstract bool TryFindNextStartingPosition(SymbolicRe private interface IAcceleratedStateHandler { public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, - ref int currentStateId, ref int pos, int initialStateId); + SymbolicRegexMatcher matcher, ReadOnlySpan input, + byte[] lookup, ref int currentStateId, ref int pos, int initialStateId); } private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool 
TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) { if (currentStateId != initialStateId) { return false; } - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - return true; + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; } - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; return true; } } + private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - byte[] lookup, - ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) + ReadOnlySpan input, + byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) { if (currentStateId != initialStateId) + { return false; + } if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - currentStateId = matcher._dotstarredInitialStates[ - matcher._positionKinds[ - OptimizedSmallInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1] - ].Id; - return true; + currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id; + } + else + { + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; } - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; return true; } } @@ -1790,26 +1704,18 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler { 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - byte[] lookup, - ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) - { - return false; - } + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) => + false; } - /// - /// No-op handler for when there are no initial state optimizations to apply. - /// + /// No-op handler for when there are no initial state optimizations to apply. private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader - { - // return true to indicate that the current position is a possible starting position - return true; - } + where TInputReader : struct, IInputReader => + true; // the current position is a possible starting position } /// @@ -1822,26 +1728,33 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - // No match exists - return false; + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + return true; } - // Update the starting state based on where TryFindNextStartingPosition moved us to. 
- // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); - return true; + // No match exists + return false; } } - /// - /// Interface for evaluating nullability of states. - /// + /// Interface for evaluating nullability of states. private interface INullabilityHandler { - public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler; + public static abstract bool IsNullableAt( + SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) + where TStateHandler : struct, IStateHandler; + } + + /// This nullability handler interface can be used in DFAs for patterns that do not contain \Z. + private interface IOptimizedNullabilityHandler + { + public static abstract bool IsNullable( + SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, + byte[] lookup, ReadOnlySpan input, int pos); } /// @@ -1865,9 +1778,37 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler + where TStateHandler : struct, IStateHandler => + flags.IsNullable() || + (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + } + + private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { - return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, 
matcher.GetPositionKind(positionId))); + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + return nullabilityArray[currStateId] > 0; + } + } + + private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + + if (nullabilityArray[currStateId] > 0) + { + char c = input[pos]; + return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); + } + + return false; } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 4309054c354e6..5384810092b7f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -185,7 +185,7 @@ internal bool CanBeNullable public List> ToList(List>? 
list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat) { Debug.Assert(listKind is SymbolicRegexNodeKind.Concat or SymbolicRegexNodeKind.Alternate); - list ??= new List>(); + list ??= []; AppendToList(this, list, listKind); return list; @@ -394,10 +394,11 @@ SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is - SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or - SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, - kind is SymbolicRegexNodeKind.EndAnchorZ)); + return Create( + builder, kind, null, null, -1, -1, default, + SymbolicRegexInfo.Anchor( + isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, + isEndZAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ)); } #endregion @@ -541,8 +542,8 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder> seenElems = new(); // Keep track of if any elements from the right side need to be eliminated + HashSet> seenElems = []; bool rightChanged = false; for (int i = 0; i < elems.Count; i++) { @@ -836,7 +837,7 @@ private static bool TryFoldAlternation(SymbolicRegexBuilder builder, Symbo static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? 
prefix) { - List> prefixElements = new(); + List> prefixElements = []; SymbolicRegexNode suffix = right; while (suffix._kind == SymbolicRegexNodeKind.Concat) { @@ -1052,7 +1053,7 @@ public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder /// the derivative internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context) { - List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new(); + List<(SymbolicRegexNode, DerivativeEffect[])> transitions = []; CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions); return transitions; } @@ -1085,9 +1086,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexB return this; // Cache result to avoid otherwise potential quadratic worst case behavior - SymbolicRegexNode? prunedNode; (SymbolicRegexNode, uint) key = (this, context); - if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) + if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out SymbolicRegexNode? prunedNode)) { return prunedNode; } @@ -1254,9 +1254,8 @@ private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder buil return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context); } - SymbolicRegexNode? derivative; (SymbolicRegexNode, TSet, uint) key = (this, elem, context); - if (builder._derivativeCache.TryGetValue(key, out derivative)) + if (builder._derivativeCache.TryGetValue(key, out SymbolicRegexNode? 
derivative)) { return derivative; } @@ -1434,7 +1433,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex return; } - currentEffects ??= new List(); + currentEffects ??= []; // If we've reached a node with no effects, then output that with the effects that have been accumulated if (!_info.ContainsEffect) @@ -1469,7 +1468,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects); } break; @@ -1507,7 +1506,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects); } break; @@ -1896,12 +1895,8 @@ private void CollectSets(SymbolicRegexBuilder builder, HashSet sets) } /// Compute and sort all the minterms from the sets in this regex. 
- public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) - { - HashSet sets = GetSets(builder); - List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets); - return minterms.ToArray(); - } + public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) => + MintermGenerator.GenerateMinterms(builder._solver, GetSets(builder)).ToArray(); /// /// Create the reverse of this regex diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index bf7d5a6501699..5d73a3e232e80 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -17,11 +17,8 @@ internal static class SymbolicRegexThresholds /// an NFA. As an NFA, we instead track all of the states we're in at any given point. /// /// - /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance - /// is currently approx. 50 MB. - /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially - /// significant search-time performance gains. Worst case memory consumption for the regex instance - /// can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) + /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance is currently ~50 MB. + /// Worst case memory consumption for the regex instance can be approximated to ~(NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state. 
/// internal const int NfaNodeCountThreshold = 125_000; @@ -34,8 +31,8 @@ internal static class SymbolicRegexThresholds /// This default value may be overridden with the AppContext data /// whose name is given by . /// - /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s, - /// although it could be safely raised higher at the expense of worst-case NFA performance + /// This limit is chosen due to worst case NFA speed constraints, + /// although it could be safely raised higher at the expense of worst-case NFA performance. /// internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 6ad2275f9584b..1f0e2932c6425 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2653,36 +2653,25 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; } } -#if NET + [Fact] public async Task MatchNonBacktrackingOver255Minterms() { - // This is a test for the rare over 255 unique minterms case in MintermClassifier - StringBuilder pattern = new(); - StringBuilder input = new(); - for (int i = 128; i <= 400; i++) - { - char c = (char)i; - pattern.Append(c); - // adding an optional char as well just so it's not a string literal - pattern.Append(c); - pattern.Append('?'); - // input is the pattern itself - input.Append(c); - } + // While valid on all engines, this test in particular is designed to exercise the rare case + // of more than 255 unique minterms in the non-backtracking engine's minterm classifier.
- string patternString = pattern.ToString(); - string inputString = input.ToString(); + IEnumerable chars = Enumerable.Range(128, 400 - 128).Select(i => (char)i); + string patternString = string.Concat(chars.Select(c => $"{c}{c}?")); // adding an optional char as well just so it's not a string literal + string inputString = string.Concat(chars); foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); + Regex r = await RegexHelpers.GetRegexAsync(engine, patternString); MatchCollection ms = r.Matches(inputString); Assert.Equal(1, ms.Count); Assert.Equal(0, ms[0].Index); - Assert.Equal(273, ms[0].Length); + Assert.Equal(272, ms[0].Length); } } -#endif } }