From 6bb517af08271b1f5afda3de78155bf492576790 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 10 Jul 2024 15:01:01 -0400 Subject: [PATCH] Review and clean up some code Simplification, style consistency, dead code deletion, some bounds-check removal, etc. --- .../RegularExpressions/RegexReplacement.cs | 1 - .../Symbolic/MatchReversal.cs | 42 +- .../Symbolic/MatchReversalKind.cs | 30 +- .../Symbolic/MatchingState.cs | 44 ++- .../Symbolic/MintermClassifier.cs | 82 ++-- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 169 ++++---- .../Symbolic/SymbolicRegexMatcher.cs | 364 ++++++++---------- .../Symbolic/SymbolicRegexNode.cs | 37 +- .../Symbolic/SymbolicRegexThresholds.cs | 11 +- .../FunctionalTests/Regex.Match.Tests.cs | 27 +- 10 files changed, 370 insertions(+), 437 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 49205f5ee2649f..d2aec2621a81c8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -5,7 +5,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; #pragma warning disable CS8500 // takes address of managed type diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs index cd00755dbe6dcf..2ea1ea8af7422c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -1,15 +1,39 @@ // 
Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -namespace System.Text.RegularExpressions.Symbolic; +using System.Diagnostics; -internal readonly struct MatchReversal( - MatchReversalKind kind, - int fixedLength, - MatchingState? adjustedStartState = null) - where TSet : IComparable, IEquatable +namespace System.Text.RegularExpressions.Symbolic { - internal MatchReversalKind Kind { get; } = kind; - internal int FixedLength { get; } = fixedLength; - internal MatchingState? AdjustedStartState { get; } = adjustedStartState; + /// Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed. + internal readonly struct MatchReversalInfo where TSet : IComparable, IEquatable + { + /// Initializes the match reversal details. + internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null) + { + Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength); + Debug.Assert(fixedLength >= 0); + Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength)); + + Kind = kind; + FixedLength = fixedLength; + AdjustedStartState = adjustedStartState; + } + + /// Gets the kind of the match reversal processing required. + internal MatchReversalKind Kind { get; } + + /// Gets the fixed length of the match, if one is known. + /// + /// For , this is ignored. + /// For , this is the full length of the match. The beginning may be found simply + /// by subtracting this length from the end. + /// For , this is the length of fixed portion of the match. + /// + internal int FixedLength { get; } + + /// Gets the adjusted start state to use for partial fixed-length matches. + /// This will be non-null iff is . + internal MatchingState? 
AdjustedStartState { get; } + } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs index d498e4dd7eb99c..a949e6204a16a3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs @@ -1,14 +1,26 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -namespace System.Text.RegularExpressions.Symbolic; - -internal enum MatchReversalKind +namespace System.Text.RegularExpressions.Symbolic { - /// The most generic option, run the regex backwards to find beginning of match - MatchStart, - /// Part of the reversal is fixed length and can be skipped - PartialFixedLength, - /// The entire pattern is fixed length, reversal not necessary - FixedLength + /// Specifies the kind of a . + internal enum MatchReversalKind + { + /// The regex should be run in reverse to find beginning of the match. + MatchStart, + + /// The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match. + /// + /// Reverse execution is not necessary for a subset of the match. + /// will contain the length of the fixed portion. + /// + PartialFixedLength, + + /// The entire pattern is of a fixed length. + /// + /// Reverse execution is not necessary to find the beginning of the match. + /// will contain the length of the match. 
+ /// + FixedLength + } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index 405be0318bbd55..3aacc4a61cbb94 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) NullabilityInfo = BuildNullabilityInfo(); } - internal int NullabilityInfo { get; } - /// The regular expression that labels this state and gives it its semantics. internal SymbolicRegexNode Node { get; } @@ -98,15 +96,31 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); } - /// - /// Cached nullability check with encoded bits - /// + /// Determines whether the node is nullable for the given context. + /// + /// This is functionally equivalent to , but using cached + /// answers stored in . + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - return ((1 << (int)nextCharKind) & NullabilityInfo) != 0; + Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount); + return (NullabilityInfo & (1 << (int)nextCharKind)) != 0; } + /// Gets the nullability info for the matching state. + /// + /// + /// 00000 -> node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter + /// + /// + internal int NullabilityInfo { get; } + /// /// Builds a with the relevant flags set. 
/// @@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial) return info; } - /// - /// Builds the nullability information for the matching state. - /// Nullability for each context is encoded in a bit - /// 0 means node cannot be nullable - /// 00001 -> nullable for General - /// 00010 -> nullable for BeginningEnd - /// 00100 -> nullable for NewLine - /// 01000 -> nullable for NewLineS - /// 10000 -> nullable for WordLetter - /// - internal byte BuildNullabilityInfo() + /// Builds the nullability information for the matching state. + /// Nullability for each context is encoded in a bit. See . + private byte BuildNullabilityInfo() { byte nullabilityInfo = 0; if (Node.CanBeNullable) { - for (uint ck = 0; ck < CharKind.CharKindCount; ck++) + for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++) { - nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0); + nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 7a1af1fb5496b9..24d2a26f849229 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; namespace System.Text.RegularExpressions.Symbolic @@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - /// An array used to map characters to minterms + /// Mapping for characters to minterms, used in the vast majority case when there are fewer than 256 minterms. + /// _lookup[char] provides the minterm ID. If char >= _lookup.Length, its minterm is 0. private readonly byte[]? _lookup; - /// - /// Fallback lookup if over 255 minterms. This is rarely used. - /// + /// Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used. + /// _intLookup[char] provides the minterm ID. If char >= _intLookup.Length, its minterm is 0. private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. @@ -37,51 +39,54 @@ public MintermClassifier(BDD[] minterms) if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). - _lookup = Array.Empty(); + _lookup = []; return; } - int _maxChar = -1; - // attempt to save memory in common cases by allocating only up to the highest char code + // Compute all minterm ranges. We do this here in order to determine the maximum character value + // in order to size the lookup array to minimize steady-state memory consumption of the potentially + // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory + // consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case. + // However, when there are more than 255 minterms, we need to use int[] _intLookup. 
+ (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length); + + int maxChar = -1; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2); + (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]); + charRangesPerMinterm[mintermId] = ranges; + maxChar = Math.Max(maxChar, (int)ranges[^1].Item2); } - // It's incredibly rare for a regex to use more than a hundred or two minterms, - // but we need a fallback just in case. + // It's incredibly rare for a regex to use more than a couple hundred minterms, + // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.) if (minterms.Length > 255) { - // over 255 unique sets also means it's never ascii only - int[] lookup = new int[_maxChar + 1]; - for (int mintermId = 1; mintermId < minterms.Length; mintermId++) - { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) - { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill(mintermId); - } - } - _intLookup = lookup; + _intLookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); } else { - byte[] lookup = new byte[_maxChar + 1]; + _lookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); + } + + // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive. + Array.Clear(charRangesPerMinterm, 0, minterms.Length); + ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm); + + // Creates the lookup array. 
+ static T[] CreateLookup(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger + { + T[] lookup = new T[_maxChar + 1]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { - // precompute all assigned minterm categories - (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]); - foreach ((uint start, uint end) in mintermRanges) + // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm. + foreach ((uint start, uint end) in charRangesPerMinterm[mintermId]) { - // assign character ranges in bulk - Span slice = lookup.AsSpan((int)start, (int)(end + 1 - start)); - slice.Fill((byte)mintermId); + lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId)); } } - _lookup = lookup; + + return lookup; } } @@ -89,9 +94,9 @@ public MintermClassifier(BDD[] minterms) [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - if (_intLookup is null) + if (_lookup is not null) { - byte[] lookup = _lookup!; + byte[] lookup = _lookup; return (uint)c < (uint)lookup.Length ? lookup[c] : 0; } else @@ -104,20 +109,17 @@ public int GetMintermID(int c) /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. /// Null if there are greater than 255 minterms. /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public byte[]? ByteLookup() => _lookup; + public byte[]? ByteLookup => _lookup; /// /// Gets a mapping from char to minterm for the rare case when there are >= 255 minterms. /// Null in the common case where there are fewer than 255 minterms. /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int[]? IntLookup() => _intLookup; + public int[]? 
IntLookup => _intLookup; /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1; + public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 306704994c3dee..327f5666f9e2a5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -26,7 +26,7 @@ internal sealed partial class SymbolicRegexMatcher /// Cache for the states that have been created. Each state is uniquely identified by its associated /// and the kind of the previous character. /// - private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new(); + private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = []; /// /// Maps state ids to states, initial capacity is given by . @@ -41,20 +41,14 @@ internal sealed partial class SymbolicRegexMatcher /// private StateFlags[] _stateFlagsArray; - /// - /// important: the pattern must not contain endZ for this to be valid. - /// Used to short-circuit nullability in the hot loop - /// nullability for each context is encoded in a bit - /// 0 means node cannot be nullable - /// 00001 -> nullable for General - /// 00010 -> nullable for BeginningEnd - /// 00100 -> nullable for NewLine - /// 01000 -> nullable for NewLineS - /// 10000 -> nullable for WordLetter - /// + /// Cached nullability info for each state ID. 
+ /// + /// _nullabilityArray[stateId] == the for that state. + /// Used to short-circuit nullability in the hot loop. + /// Important: the pattern must not contain endZ for this to be valid. + /// private byte[] _nullabilityArray; - /// /// The transition function for DFA mode. /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is @@ -84,7 +78,7 @@ internal sealed partial class SymbolicRegexMatcher /// It is the inverse of used entries in _nfaStateArray. /// The range of this map is 0 to its size - 1. /// - private readonly Dictionary _nfaIdByCoreId = new(); + private readonly Dictionary _nfaIdByCoreId = []; /// /// Transition function for NFA transitions in NFA mode. @@ -127,7 +121,7 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private bool IsNullableWithContext(int stateId, int mintermId) => - ((1 << (int)GetPositionKind(mintermId)) & _nullabilityArray[stateId]) > 0; + (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0; /// Returns the span from that may contain transitions for the given state private Span GetDeltasFor(MatchingState state) @@ -175,98 +169,75 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint } /// - /// Optimized reversal state computation during construction which - /// skips the fixed length parts of reversal - /// e.g. for the pattern abc.*def + /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process + /// for when finding the beginning of a match. + /// + /// + /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. 
for the pattern abc.*def /// 1) the end is found at abc.*def| /// 2) the reversal starts at abc.*| - /// - /// reversed initial pattern - /// returns num of chars to skip and adjusted reversal start state - private MatchReversal CreateOptimizedReversal(SymbolicRegexNode node) + /// + /// Reversed initial pattern + /// The match reversal details. + private MatchReversalInfo CreateOptimizedReversal(SymbolicRegexNode node) { int pos = 0; - SymbolicRegexNode current = node; - bool canLoop = true; - - while (canLoop) + while (true) { - (bool loop, SymbolicRegexNode next) = current switch + if (node._info.ContainsSomeAnchor) { - // Bail if it contains any anchors. (This could potentially be a very good future optimization for - // anchors but there's too many edge cases to guarantee it works. - // one example which fails currently: pattern: @"\By\b", input: "xy") - { _info.ContainsSomeAnchor: true } => Bail(), - - // if this is reached then entire match is fixed length - { _kind: SymbolicRegexNodeKind.CaptureStart} => (false, _builder.Epsilon), - - { _kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.CaptureEnd } => (true, current._right!), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.BoundaryAnchor } => (true, current._right!), - - {_kind:SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Singleton} => AddSingleton(current), - - {_kind: SymbolicRegexNodeKind.Concat, _left._kind: SymbolicRegexNodeKind.Loop } => - AddFixedLengthLoop(current), - - _ => (false, current) - }; - canLoop = loop; - current = next; - } - - - return - pos <= 0 ? new MatchReversal(MatchReversalKind.MatchStart, 0) : - current == _builder.Epsilon ? new MatchReversal(MatchReversalKind.FixedLength, pos) : - new MatchReversal(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(current), 0)); + // Bail if it contains any anchors as it invalidates the optimization. 
+ // (This could potentially be a very good future optimization for anchors but there's too many edge cases to guarantee it works. + // One example which fails currently: pattern: @"\By\b", input: "xy") + pos = 0; + break; + } - // finding anchors inside pattern invalidates this optimization - (bool, SymbolicRegexNode) Bail() - { - pos = 0; - // return original node - return (false, node); - } + if (node._kind is not SymbolicRegexNodeKind.Concat) + { + if (node._kind is SymbolicRegexNodeKind.CaptureStart) + { + node = _builder.Epsilon; // The entire match is fixed length. + } + break; + } - (bool, SymbolicRegexNode) AddSingleton(SymbolicRegexNode concatNode) - { - pos += 1; - // continue with next concat - return (true, concatNode._right!); - } + SymbolicRegexNode? left = node._left; + Debug.Assert(left is not null); - (bool, SymbolicRegexNode) AddFixedLengthLoop(SymbolicRegexNode concatNode) - { - SymbolicRegexNode? loopNode = concatNode._left; - if (loopNode is { _lower: <= 0 }) + if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton) { - return (false, concatNode); + node = node._right!; + if (left._kind is SymbolicRegexNodeKind.Singleton) + { + pos++; + } } + else if (left._kind is SymbolicRegexNodeKind.Loop) + { + if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton) + { + break; + } - switch (loopNode!._left!.Kind) + node = left._lower == left._upper ? + node._right! : // The entire loop is fixed + _builder.CreateConcat( // Subtract the fixed part of the loop. 
+ _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower), + node._right!); + pos += left._lower; + } + else { - case SymbolicRegexNodeKind.Singleton: - - if (loopNode._lower == loopNode._upper) - { - pos += loopNode._lower; - // the entire loop is fixed, continue - return (true, concatNode._right!); - } - - // subtract the fixed part of the loop - int loopRemainder = loopNode._upper - loopNode._lower; - SymbolicRegexNode newLeft = - _builder.CreateLoop(loopNode._left, loopNode.IsLazy, 0, loopRemainder); - SymbolicRegexNode newNode = _builder.CreateConcat(newLeft, concatNode._right!); - pos += loopNode._lower; - return (true, newNode); - default: - return (false, concatNode); + break; } } + + Debug.Assert(pos >= 0); + return + pos == 0 ? new MatchReversalInfo(MatchReversalKind.MatchStart, 0) : + node == _builder.Epsilon ? new MatchReversalInfo(MatchReversalKind.FixedLength, pos) : + new MatchReversalInfo(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0)); } /// @@ -299,7 +270,7 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node } _stateArray[state.Id] = state; _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); - _nullabilityArray[state.Id] = state.BuildNullabilityInfo(); + _nullabilityArray[state.Id] = (byte)state.NullabilityInfo; } return state; @@ -395,11 +366,8 @@ private bool TryCreateNewTransition( MatchingState? 
targetState = _stateArray[_dfaDelta[offset]]; if (targetState is null) { - if (// check if there is an active timer - (timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || - // check if amount of nodes exceeds the NFA threshold - (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold) - ) + if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer + (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold { nextState = null; return false; @@ -438,7 +406,7 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse SymbolicRegexNode targetNode = coreTargetId > 0 ? GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); - List targetsList = new(); + List targetsList = []; ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) => targetsList.Add(nfaId)); @@ -465,8 +433,9 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse TSet minterm = GetMintermFromId(mintermId); uint nextCharKind = GetPositionKind(mintermId); List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind); + // Build the new state and store it into the array. 
- List<(int, DerivativeEffect[])> targetsList = new(); + List<(int, DerivativeEffect[])> targetsList = []; foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition) { ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index b80314c7428400..14372bf31e34cc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Buffers; using System.Collections.Generic; using System.Diagnostics; using System.IO; @@ -84,17 +83,16 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// /// Dead end state to quickly return NoMatch. - /// This could potentially be a constant if it's the very first state created /// private readonly int _deadStateId; - /// Initial state used for vectorization + /// Initial state used for vectorization. private readonly int _initialStateId; - /// Whether the pattern contains any anchor + /// Whether the pattern contains any anchor. private readonly bool _containsAnyAnchor; - /// Whether the pattern contains the EndZ anchor, which makes most optimization shortcuts invalid + /// Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts. private readonly bool _containsEndZAnchor; /// The initial states for the original pattern, keyed off of the previous character kind. 
@@ -109,10 +107,8 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _reverseInitialStates; - /// - /// Reversal state which skips fixed length parts. - /// - private readonly MatchReversal _optimizedReversalState; + /// Details on optimized processing of the reverse of the pattern to find the beginning of a match. + private readonly MatchReversalInfo _optimizedReversalInfo; /// Partition of the input space of sets. private readonly TSet[] _minterms; @@ -190,8 +186,7 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; - - // Initialization for fields in SymbolicRegexMatcher.Automata.cs + // Initialize state and nullability arrays. _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; _nullabilityArray = new byte[InitialDfaStateCapacity]; @@ -206,8 +201,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); } - // Create optimized reversal - _optimizedReversalState = CreateOptimizedReversal(_pattern.Reverse(builder)); + // Gather optimized reversal processing information. + _optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder)); // Store the find optimizations that can be used to jump ahead to the next possible starting location. 
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's @@ -251,9 +246,8 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo } _dotstarredInitialStates = dotstarredInitialStates; - // Assign dead state id + // Assign dead and initial state ids _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; - // Assign initial state id _initialStateId = _dotstarredInitialStates[CharKind.General].Id; // Create the reverse pattern (the original pattern in reverse order) and all of its @@ -378,38 +372,25 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. - // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases + // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases. int matchEnd; - if (!(_containsEndZAnchor || _mintermClassifier.IntLookup() is not null)) + if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null) { + // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. 
matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (true, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true) => - FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else { - // fallback for Z anchor or over 255 minterms - matchEnd = (_findOpts is not null) switch - { - true => - FindEndPositionFallback( - input, startat, timeoutOccursAt, mode, perThreadData), - false => - FindEndPositionFallback( - input, startat, timeoutOccursAt, mode, perThreadData), - }; + // Fallback for Z anchor or over 255 minterms + matchEnd = _findOpts is not null ? + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); } // If there wasn't a match, we're done. @@ -431,60 +412,57 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that // exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from // that last b until it finds the 4th a: aaabbbc. 
- int matchStart; + int matchStart = 0; Debug.Assert(matchEnd >= startat - 1); - switch (_optimizedReversalState.Kind) + switch (_optimizedReversalInfo.Kind) { - case MatchReversalKind.FixedLength: - matchStart = (matchEnd - _optimizedReversalState.FixedLength); - break; - case MatchReversalKind.MatchStart: case MatchReversalKind.PartialFixedLength: int initialLastStart = -1; // invalid sentinel value int i = matchEnd; CurrentState reversalStartState; - if (_optimizedReversalState.Kind == MatchReversalKind.PartialFixedLength) + + if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart) + { + // No fixed-length knowledge. Start at the end of the match. + reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); + } + else { - i -= _optimizedReversalState.FixedLength; - reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!); + // There's a fixed-length portion at the end of the match. Start just before it. + i -= _optimizedReversalInfo.FixedLength; + reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!); // reversal may already be nullable here in the case of anchors - if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0) - { - if (FullNullabilityHandler.IsNullableAt(this, - in reversalStartState, FullInputReader.GetPositionId(this, input, i), + if (_containsAnyAnchor && + _nullabilityArray[reversalStartState.DfaStateId] > 0 && + FullNullabilityHandler.IsNullableAt( + this, in reversalStartState, FullInputReader.GetPositionId(this, input, i), DfaStateHandler.GetStateFlags(this, in reversalStartState))) - { - initialLastStart = i; - } + { + initialLastStart = i; } } - else - { - reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); - } - matchStart = matchEnd < startat - ? startat - : (_containsEndZAnchor, _containsAnyAnchor) switch + matchStart = matchEnd < startat ? 
startat : (_containsEndZAnchor, _containsAnyAnchor) switch { - (true, true) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (true, false) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, true) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, false) => - FindStartPosition( - reversalStartState, initialLastStart, input, i, startat, perThreadData), + // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization; + // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized. + (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), }; break; + + case MatchReversalKind.FixedLength: + // The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there. + matchStart = matchEnd - _optimizedReversalInfo.FixedLength; + break; + default: - throw new ArgumentOutOfRangeException(); + Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}"); + break; } // Phase 3: @@ -513,8 +491,7 @@ private int FindEndPositionOptimized(input, pos - 1)]); int endPos = NoMatchExists; @@ -527,26 +504,19 @@ private int FindEndPositionOptimized DfaCharsPerTimeoutCheck - ? 
pos + DfaCharsPerTimeoutCheck - : lengthMinus1; - done = - FindEndPositionDeltasDFAOptimized< - TAcceleratedStateHandler, - TOptimizedNullabilityHandler>(input, innerLoopLength, mode, timeoutOccursAt, ref pos, - ref currentState.DfaStateId, ref endPos); + innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; + done = FindEndPositionDeltasDFAOptimized( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState.DfaStateId, ref endPos); } else { - // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here - const int NfaCharsPerTimeoutCheck = 1000; - innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck - ? pos + NfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasNFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref initialStatePosCandidate, ref initialStatePosCandidate); + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -611,27 +581,18 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck - ? 
pos + DfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasDFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePosCandidate); + const int DfaCharsPerTimeoutCheck = 25_000; + innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasDFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } else { - // nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here - // worst case NFA speed is about 150 kb/s, this means the check is about every 13ms - const int NfaCharsPerTimeoutCheck = 1000; - innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck - ? pos + NfaCharsPerTimeoutCheck - : input.Length; - done = - FindEndPositionDeltasNFA(input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, - ref endStateId, ref initialStatePosCandidate); + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -665,18 +626,17 @@ private int FindEndPositionFallback /// This version of uses a different set of interfaces, - /// which don't check for many inner loop edge cases e.g. input end or '\n'. + /// which don't check for many inner loop edge cases, e.g. input end or '\n'. 
/// All edge cases are handled before entering the loop. /// - private bool FindEndPositionDeltasDFAOptimized(ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFAOptimized( + ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) where TAcceleratedStateHandler : struct, IAcceleratedStateHandler where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler { - // initial check for input end to get it out of the loop + // Initial check for input end lifted out of the subsequent hot-path loop. if (posRef == input.Length) - { if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0])) { @@ -688,12 +648,12 @@ private int FindEndPositionFallback= lengthMinus1) { if (pos + 1 < input.Length) @@ -755,6 +712,7 @@ private int FindEndPositionFallback(this, input, ref state, ref pos)) { @@ -827,8 +787,7 @@ private bool FindEndPositionDeltasDFA(this, in state, - positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { endPos = pos; @@ -1178,7 +1137,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, } Debug.Assert(current.Count > 0); - foreach (var (endStateId, endRegisters) in current.Values) + foreach ((int endStateId, Registers endRegisters) in current.Values) { MatchingState endState = GetState(GetCoreStateId(endStateId)); if (endState.IsNullableFor(GetCharKind(input, iEnd))) @@ -1194,6 +1153,16 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, return default; } + /// Look up the min term ID for the character at the specified position in the input. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos >= 0 && pos < input.Length); + + char c = input[pos]; + return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0; + } + /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. internal struct Registers(int[] captureStarts, int[] captureEnds) @@ -1442,8 +1411,8 @@ internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, re /// - whether this state may be contextually nullable /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) - => matcher._stateFlagsArray[state.DfaStateId]; + public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) => + matcher._stateFlagsArray[state.DfaStateId]; } /// An for operating over instances configured as NFA states. @@ -1594,16 +1563,16 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< [MethodImpl(MethodImplOptions.AggressiveInlining)] public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) { - SparseIntMap stateSet = state.NfaState!.NfaStateSet; // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if // they are true for any state in the set; SimulatesBacktracking is true for all the states if // it is true for any state (since it is a phase-wide property); and all other flags are masked out. 
StateFlags flags = 0; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values)) + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; } + return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); } @@ -1628,41 +1597,20 @@ public static void UndoTransition(ref CurrentState state) #endif } - - - // /// - // /// This reader maps all characters > maxChar to 0 - // /// - private readonly struct OptimizedSmallInputReader - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan input, int pos) - { - Debug.Assert(pos < input.Length, "pos < input.Length"); - Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}"); - char c = input[pos]; - return c < (uint)lookup.Length ? lookup[c] : 0; - } - } - - /// - /// This nullability handler interface can be used in DFAs - /// for patterns that do not contain \Z - /// + /// This nullability handler interface can be used in DFAs for patterns that do not contain \Z. 
private interface IOptimizedNullabilityHandler { - public static abstract bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, - int pos); + public static abstract bool IsNullable( + SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, + byte[] lookup, ReadOnlySpan input, int pos); } private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, - int maxChar, ReadOnlySpan input, int pos) + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { - Debug.Assert(pos < input.Length, "input end should not be handled here"); + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); return nullabilityArray[currStateId] > 0; } @@ -1671,15 +1619,18 @@ public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabi private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, - byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan input, int pos) + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { - Debug.Assert(pos < input.Length, "input end should not be handled here"); + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to 
date"); - return - nullabilityArray[currStateId] > 0 && - matcher.IsNullableWithContext(currStateId, - input[pos] < (uint)lookup.Length ? lookup[input[pos]] : 0); + + if (nullabilityArray[currStateId] > 0) + { + char c = input[pos]; + return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); + } + + return false; } } @@ -1695,13 +1646,11 @@ private interface IInputReader public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); } - - /// This reader omits the special handling of \n for the \Z anchor. private readonly struct NoZAnchorInputReader : IInputReader { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]); + (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1; } /// This reader includes full handling of an \n as the last character of input for the \Z anchor. 
@@ -1721,7 +1670,6 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan } } - private interface IInitialStateHandler { public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, @@ -1750,17 +1698,17 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche return false; } - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - return true; + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; } - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; return true; } } + private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1769,20 +1717,21 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) { if (currentStateId != initialStateId) + { return false; + } if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - currentStateId = matcher._dotstarredInitialStates[ - matcher._positionKinds[ - OptimizedSmallInputReader.GetPositionId(lookup, lookup.Length + 1, input, pos - 1) + 1] - ].Id; - return true; + currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id; + } + else + { + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; } - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; return true; } } @@ -1790,26 +1739,18 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - 
byte[] lookup, - ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) - { - return false; - } + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, byte[] lookup, ReadOnlySpan input, ref int currentStateId, ref int pos, int initialStateId) => + false; } - /// - /// No-op handler for when there are no initial state optimizations to apply. - /// + /// No-op handler for when there are no initial state optimizations to apply. private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader - { - // return true to indicate that the current position is a possible starting position - return true; - } + where TInputReader : struct, IInputReader => + true; // the current position is a possible starting position } /// @@ -1822,16 +1763,16 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - // No match exists - return false; + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + return true; } - // Update the starting state based on where TryFindNextStartingPosition moved us to. - // As with the initial starting state, if it's a dead end, no match exists. 
- state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); - return true; + // No match exists + return false; } } @@ -1841,7 +1782,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche private interface INullabilityHandler { public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler; + where TStateHandler : struct, IStateHandler; } /// @@ -1865,10 +1806,9 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler - { - return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); - } + where TStateHandler : struct, IStateHandler => + flags.IsNullable() || + (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 4309054c354e6f..5384810092b7fc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -185,7 +185,7 @@ internal bool CanBeNullable public List> ToList(List>? 
list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat) { Debug.Assert(listKind is SymbolicRegexNodeKind.Concat or SymbolicRegexNodeKind.Alternate); - list ??= new List>(); + list ??= []; AppendToList(this, list, listKind); return list; @@ -394,10 +394,11 @@ SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is - SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or - SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, - kind is SymbolicRegexNodeKind.EndAnchorZ)); + return Create( + builder, kind, null, null, -1, -1, default, + SymbolicRegexInfo.Anchor( + isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, + isEndZAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ)); } #endregion @@ -541,8 +542,8 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder> seenElems = new(); // Keep track of if any elements from the right side need to be eliminated + HashSet> seenElems = []; bool rightChanged = false; for (int i = 0; i < elems.Count; i++) { @@ -836,7 +837,7 @@ private static bool TryFoldAlternation(SymbolicRegexBuilder builder, Symbo static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? 
prefix) { - List> prefixElements = new(); + List> prefixElements = []; SymbolicRegexNode suffix = right; while (suffix._kind == SymbolicRegexNodeKind.Concat) { @@ -1052,7 +1053,7 @@ public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder /// the derivative internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context) { - List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new(); + List<(SymbolicRegexNode, DerivativeEffect[])> transitions = []; CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions); return transitions; } @@ -1085,9 +1086,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexB return this; // Cache result to avoid otherwise potential quadratic worst case behavior - SymbolicRegexNode? prunedNode; (SymbolicRegexNode, uint) key = (this, context); - if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) + if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out SymbolicRegexNode? prunedNode)) { return prunedNode; } @@ -1254,9 +1254,8 @@ private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder buil return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context); } - SymbolicRegexNode? derivative; (SymbolicRegexNode, TSet, uint) key = (this, elem, context); - if (builder._derivativeCache.TryGetValue(key, out derivative)) + if (builder._derivativeCache.TryGetValue(key, out SymbolicRegexNode? 
derivative)) { return derivative; } @@ -1434,7 +1433,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex return; } - currentEffects ??= new List(); + currentEffects ??= []; // If we've reached a node with no effects, then output that with the effects that have been accumulated if (!_info.ContainsEffect) @@ -1469,7 +1468,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects); } break; @@ -1507,7 +1506,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects); } break; @@ -1896,12 +1895,8 @@ private void CollectSets(SymbolicRegexBuilder builder, HashSet sets) } /// Compute and sort all the minterms from the sets in this regex. 
- public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) - { - HashSet sets = GetSets(builder); - List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets); - return minterms.ToArray(); - } + public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) => + MintermGenerator.GenerateMinterms(builder._solver, GetSets(builder)).ToArray(); /// /// Create the reverse of this regex diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index bf7d5a6501699c..5d73a3e232e809 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -17,11 +17,8 @@ internal static class SymbolicRegexThresholds /// an NFA. As an NFA, we instead track all of the states we're in at any given point. /// /// - /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance - /// is currently approx. 50 MB. - /// There is some motivation to make this configurable, as it can exchange upfront costs with potentially - /// significant search-time performance gains. Worst case memory consumption for the regex instance - /// can be approximated to about (NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) + /// This limit is chosen due to memory usage constraints, the largest possible memory allocation for a regex instance is currently ~50 MB. + /// Worst case memory consumption for the regex instance can be approximated to ~(NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode)) /// while it most cases the MatchingState part can be ignored, as only a subset of nodes have their own state. 
/// internal const int NfaNodeCountThreshold = 125_000; @@ -34,8 +31,8 @@ internal static class SymbolicRegexThresholds /// This default value may be overridden with the AppContext data /// whose name is given by . /// - /// This limit is chosen due to worst case NFA speed constraints, which is about 150kb/s, - /// although it could be safely raised higher at the expense of worst-case NFA performance + /// This limit is chosen due to worst case NFA speed constraints, + /// although it could be safely raised higher at the expense of worst-case NFA performance. /// internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 6ad2275f9584b4..1f0e2932c6425d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -2653,36 +2653,25 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; } } -#if NET + [Fact] public async Task MatchNonBacktrackingOver255Minterms() { - // This is a test for the rare over 255 unique minterms case in MintermClassifier - StringBuilder pattern = new(); - StringBuilder input = new(); - for (int i = 128; i <= 400; i++) - { - char c = (char)i; - pattern.Append(c); - // adding an optional char as well just so it's not a string literal - pattern.Append(c); - pattern.Append('?'); - // input is the pattern itself - input.Append(c); - } + // While valid on all engines, this test in particular is designed to exercise the rare case + // of more than 255 unique minterms in the non-backtracking engine's minterm classifier. 
- string patternString = pattern.ToString(); - string inputString = input.ToString(); + IEnumerable chars = Enumerable.Range(128, 400 - 128).Select(i => (char)i); + string patternString = string.Concat(chars.Select(c => $"{c}{c}?")); // adding an optional char as well just so it's not a string literal + string inputString = string.Concat(chars); foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - Regex r = await RegexHelpers.GetRegexAsync(engine, patternString, RegexOptions.None); + Regex r = await RegexHelpers.GetRegexAsync(engine, patternString); MatchCollection ms = r.Matches(inputString); Assert.Equal(1, ms.Count); Assert.Equal(0, ms[0].Index); - Assert.Equal(273, ms[0].Length); + Assert.Equal(272, ms[0].Length); } } -#endif } }