Skip to content

Commit 7858a2f

Browse files
committed
reverting some changes
1 parent 8e13e00 commit 7858a2f

File tree

5 files changed

+24
-17
lines changed

5 files changed

+24
-17
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs

+2-8
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
1818
}
1919

2020
/// <summary>
21-
/// TODO: The CLR assigns an entire field for this byte which is a waste,
22-
/// and the much more preferred way to use this is in _nullabilityArray in the matcher
23-
/// but the current design relies on interfaces/flags and
24-
/// using the MatchingState directly so this byte is a quick solution to cheapen
25-
/// it there by ~30% as well without having to breaking it all to pieces
21+
/// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
2622
/// </summary>
2723
internal readonly int NullabilityInfo;
2824

@@ -106,8 +102,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
106102
}
107103

108104
/// <summary>
109-
/// TODO: This method should really never be used and
110-
/// is only used to speed up the existing architecture.
105+
/// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned
111106
/// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
112107
/// whereever possible
113108
/// </summary>
@@ -166,7 +161,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
166161
/// 00100 -> nullable for NewLine
167162
/// 01000 -> nullable for NewLineS
168163
/// 10000 -> nullable for WordLetter
169-
/// todo: change to flags later
170164
/// </summary>
171165
/// <returns></returns>
172166
internal byte BuildNullabilityInfo()

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs

+3-4
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,8 @@ public MintermClassifier(BDD[] minterms)
4747
return;
4848
}
4949

50-
// low memory compromise is to create an ascii-only array
50+
// ascii-only array to save memory
5151
// int mintermId = c >= 128 ? 0 : mtlookup[c];
52-
// and only exists because the wasm tests fail with OOM
5352
_isAsciiOnly = true;
5453
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
5554
{
@@ -60,8 +59,8 @@ public MintermClassifier(BDD[] minterms)
6059
}
6160
}
6261

63-
// assign minterm category for every char
64-
// unused characters in minterm 0 get mapped to zero
62+
// i have never seen a regex use over 80 minterms not to speak of 255,
63+
// but it's there as a fallback mechanism
6564
if (minterms.Length > 255)
6665
{
6766
// over 255 unique sets also means it's never ascii only

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
532532
// /// <summary>
533533
// /// attempt to remove anchors when possible since it reduces overhead
534534
// /// more rewrites could be tried but it's important to preserve PCRE semantics
535-
// /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner
535+
// /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner
536536
// /// </summary>
537537
// /// <param name="builder"></param>
538538
// /// <param name="rootNode"></param>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs

+16-2
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
8181
/// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
8282
private readonly RegexFindOptimizations? _findOpts;
8383

84-
/// <summary>Dead end state to quickly return NoMatch</summary>
84+
/// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
8585
private readonly int _deadStateId;
8686

8787
/// <summary>Whether the pattern contains any anchor</summary>
@@ -102,6 +102,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
102102
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
103103
private readonly MatchingState<TSet>[] _reverseInitialStates;
104104

105+
/// <summary>
106+
/// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
107+
/// </summary>
105108
private readonly (int, MatchingState<TSet>) _optimizedReversalState;
106109

107110
/// <summary>Partition of the input space of sets.</summary>
@@ -328,6 +331,8 @@ uint CalculateMintermIdKind(int mintermId)
328331
/// </summary>
329332
internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
330333

334+
/// TODO: when you're calling a function millions of times per second even this add 1 does cost something
335+
/// this should be ideally remapped
331336
/// <summary>Look up what is the character kind given a position ID</summary>
332337
[MethodImpl(MethodImplOptions.AggressiveInlining)]
333338
private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
@@ -351,6 +356,7 @@ internal TSet GetMintermFromId(int mintermId)
351356
return minterms[mintermId];
352357
}
353358

359+
/// <summary>TODO: this if-else branch could be called once. it's currently causing overhead on every single step</summary>
354360
[MethodImpl(MethodImplOptions.AggressiveInlining)]
355361
private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
356362
where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ?
@@ -657,6 +663,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
657663
/// TODO: this is essentially a stripped down version when there's no good prefix optimizations
658664
/// i don't trust the compiler to optimize this and it makes a
659665
/// ~50% difference in performance with removing unnecessary checks alone
666+
///
660667
/// </summary>
661668
private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
662669
ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
@@ -668,9 +675,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
668675
byte[] mtlookup = _mintermClassifier.ByteLookup()!;
669676
int endStateId = endStateIdRef;
670677
int currStateId = startStateId;
678+
// ldfld only once
679+
// int deadStateId = _deadStateId;
671680
try
672681
{
673682
// Loop through each character in the input, transitioning from state to state for each.
683+
// The goal is to make this loop as fast as it can possible be,
684+
// every single piece of overhead should be removed here
685+
// there should be not a single callvirt instruction in the loop
686+
// ldfld only if necessary (e.g. a reference changes)
687+
// no memory writes unless necessary
674688
while (true)
675689
{
676690
if (currStateId == _deadStateId)
@@ -783,7 +797,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
783797

784798
// If the state is nullable for the next character, meaning it accepts the empty string,
785799
// we found a potential end state.
786-
if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
800+
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
787801
{
788802
endPos = pos;
789803
endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@ internal static class SymbolicRegexThresholds
2626
/// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
2727
/// with an entirely different search-time algorithmic complexity
2828
/// 100_000 isn't a really a high memory cost either,
29-
/// i'd even put 1_000_000 on the table but that might push it for general purpose use
29+
/// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has
3030
/// </remarks>
3131
internal const int NfaThreshold = 100_000;
3232

3333
/// <summary>
3434
/// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
3535
/// after the AST has been anlayzed for safe handling.
36-
/// TODO: this is perhaps too conservative, consider raising this
36+
/// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too
3737
/// <remarks>
3838
/// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
3939
/// This default value may be overridden with the AppContext data

0 commit comments

Comments
 (0)