Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NonBacktracking inner matching loop optimizations #70217

Merged
merged 6 commits into from
Jun 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ internal DfaMatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)

internal int Id { get; set; }

internal bool IsInitialState { get; set; }

/// <summary>This is a deadend state</summary>
internal bool IsDeadend => Node.IsNothing;

Expand Down Expand Up @@ -130,7 +128,7 @@ internal DfaMatchingState<TSet> Next(TSet minterm)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullable(uint nextCharKind)
internal bool IsNullableFor(uint nextCharKind)
{
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
uint context = CharKind.Context(PrevCharKind, nextCharKind);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,54 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
/// </summary>
internal DfaMatchingState<TSet>[]? _stateArray;
internal DfaMatchingState<TSet>[]? _capturingStateArray;

/// <summary>
/// Maps state IDs to context-independent information for all states in <see cref="_stateArray"/>.
/// </summary>
internal byte[]? _stateInfo;

// Bit masks for decoding elements of _stateInfo
private const int isInitialMask = 0b0001;
private const int isDeadendMask = 0b0010;
private const int isNullableMask = 0b0100;
private const int canBeNullableMask = 0b1000;
stephentoub marked this conversation as resolved.
Show resolved Hide resolved

/// <summary>Assign the context-independent information for the given state.</summary>
/// <param name="stateId">Index into <see cref="_stateInfo"/> at which the packed flags are stored.</param>
/// <param name="isInitial">When true, the <c>isInitialMask</c> bit is set.</param>
/// <param name="isDeadend">When true, the <c>isDeadendMask</c> bit is set.</param>
/// <param name="isNullable">When true, the <c>isNullableMask</c> bit is set.</param>
/// <param name="canBeNullable">When true, the <c>canBeNullableMask</c> bit is set.</param>
internal void SetStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable)
{
    Debug.Assert(_stateInfo is not null);

    // Pack the four flags into a single byte, one bit per flag.
    byte info = 0;
    if (isInitial)
    {
        info |= isInitialMask;
    }
    if (isDeadend)
    {
        info |= isDeadendMask;
    }
    if (isNullable)
    {
        info |= isNullableMask;
    }
    if (canBeNullable)
    {
        info |= canBeNullableMask;
    }

    // Overwrites whatever was previously stored for this state ID.
    _stateInfo[stateId] = info;
}

/// <summary>Decode the context-independent flags stored in <see cref="_stateInfo"/> for the given state.</summary>
/// <param name="stateId">Index into <see cref="_stateInfo"/> to read.</param>
/// <returns>One boolean per flag bit, in the order initial, deadend, nullable, can-be-nullable.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId)
{
    Debug.Assert(_stateInfo is not null);

    // Read the packed byte once, then test each bit individually.
    int flags = _stateInfo[stateId];
    bool initialBit = (flags & isInitialMask) != 0;
    bool deadendBit = (flags & isDeadendMask) != 0;
    bool nullableBit = (flags & isNullableMask) != 0;
    bool canBeNullableBit = (flags & canBeNullableMask) != 0;

    return (initialBit, deadendBit, nullableBit, canBeNullableBit);
}

/// <remarks>
/// For these "delta" arrays, technically Volatile.Read should be used to read out an element,
/// but in practice that's not needed on the runtimes in use (though that needs to be documented
/// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is
/// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789).
/// </remarks>
internal DfaMatchingState<TSet>?[]? _delta;
internal int[]? _delta;
internal List<(DfaMatchingState<TSet>, DerivativeEffect[])>?[]? _capturingDelta;
private const int InitialStateLimit = 1024;

Expand Down Expand Up @@ -170,10 +211,11 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
{
_stateArray = new DfaMatchingState<TSet>[InitialStateLimit];
_capturingStateArray = new DfaMatchingState<TSet>[InitialStateLimit];
_stateInfo = new byte[InitialStateLimit];

// the extra +1 slot with id minterms.Length is reserved for \Z (last occurrence of \n)
_mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1;
_delta = new DfaMatchingState<TSet>[InitialStateLimit << _mintermsLog];
_delta = new int[InitialStateLimit << _mintermsLog];
_capturingDelta = new List<(DfaMatchingState<TSet>, DerivativeEffect[])>[InitialStateLimit << _mintermsLog];
}

Expand Down Expand Up @@ -208,10 +250,10 @@ internal TSet GetMinterm(int mintermId)
}

/// <summary>Returns the span from <see cref="_delta"/> that may contain transitions for the given state</summary>
internal Span<DfaMatchingState<TSet>?> GetDeltasFor(DfaMatchingState<TSet> state)
internal Span<int> GetDeltasFor(DfaMatchingState<TSet> state)
{
if (_delta is null || _minterms is null)
return Span<DfaMatchingState<TSet>?>.Empty;
return Span<int>.Empty;
int numMinterms = state.StartsWithLineAnchor ? _minterms.Length + 1 : _minterms.Length;
return _delta.AsSpan(state.Id << _mintermsLog, numMinterms);
}
Expand Down Expand Up @@ -453,8 +495,9 @@ internal SymbolicRegexNode<TNewSet> Transform<TNewSet>(SymbolicRegexNode<TSet> n
/// <param name="node">the pattern that this state will represent</param>
/// <param name="prevCharKind">the kind of the character that led to this state</param>
/// <param name="capturing">whether to use the separate space of states with capturing transitions or not</param>
/// <param name="isInitialState">whether to mark the state as an initial state or not</param>
/// <returns></returns>
public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint prevCharKind, bool capturing = false)
public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint prevCharKind, bool capturing = false, bool isInitialState = false)
{
//first prune the anchors in the node
TSet wlbSet = _wordLetterForBoundariesSet;
Expand All @@ -469,21 +512,21 @@ public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint pre
var s = new DfaMatchingState<TSet>(pruned_node, prevCharKind);
if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState<TSet>? state))
{
state = MakeNewState(s, capturing);
state = MakeNewState(s, capturing, isInitialState);
}

return state;
}

private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool capturing)
private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool capturing, bool isInitialState)
{
lock (this)
{
HashSet<DfaMatchingState<TSet>> cache = capturing ? _capturingStateCache : _stateCache;
cache.Add(state); // Add to cache first to make 1 the first state ID
state.Id = cache.Count;
cache.Add(state);

Debug.Assert(_stateArray is not null && _capturingStateArray is not null);
Debug.Assert(_stateArray is not null && _capturingStateArray is not null && _stateInfo is not null);

const int GrowthSize = 1024;
if (capturing)
Expand All @@ -503,8 +546,10 @@ private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool c
int newsize = _stateArray.Length + GrowthSize;
Array.Resize(ref _stateArray, newsize);
Array.Resize(ref _delta, newsize << _mintermsLog);
Array.Resize(ref _stateInfo, newsize);
}
_stateArray[state.Id] = state;
SetStateInfo(state.Id, isInitialState, state.IsDeadend, state.Node.IsNullable, state.Node.CanBeNullable);
}
return state;
}
Expand Down Expand Up @@ -549,13 +594,20 @@ private int MakeNewNfaState(int coreStateId)
}
}

/// <summary>Gets the core state corresponding to the NFA state</summary>
public DfaMatchingState<TSet> GetCoreState(int nfaStateId)
/// <summary>Gets the core state Id corresponding to the NFA state</summary>
public int GetCoreStateId(int nfaStateId)
{
Debug.Assert(_stateArray is not null);
Debug.Assert(nfaStateId < _nfaStateArray.Length);
Debug.Assert(_nfaStateArray[nfaStateId] < _stateArray.Length);
return _stateArray[_nfaStateArray[nfaStateId]];
return _nfaStateArray[nfaStateId];
}

/// <summary>Gets the core state corresponding to the NFA state</summary>
public DfaMatchingState<TSet> GetCoreState(int nfaStateId)
{
Debug.Assert(_stateArray is not null);
return _stateArray[GetCoreStateId(nfaStateId)];
}

/// <summary>Critical region for defining a new core transition</summary>
Expand All @@ -570,13 +622,13 @@ public DfaMatchingState<TSet> CreateNewTransition(DfaMatchingState<TSet> sourceS
public bool TryCreateNewTransition(
DfaMatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out DfaMatchingState<TSet>? nextState)
{
Debug.Assert(_delta is not null);
Debug.Assert(_delta is not null && _stateArray is not null);
lock (this)
{
Debug.Assert(offset < _delta.Length);

// check if meanwhile delta[offset] has become defined possibly by another thread
DfaMatchingState<TSet>? targetState = _delta[offset];
DfaMatchingState<TSet>? targetState = _stateArray[_delta[offset]];
if (targetState is null)
{
if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
Expand All @@ -586,7 +638,7 @@ public bool TryCreateNewTransition(
}

targetState = sourceState.Next(GetMinterm(mintermId));
Volatile.Write(ref _delta[offset], targetState);
Volatile.Write(ref _delta[offset], targetState.Id);
}

nextState = targetState;
Expand All @@ -597,7 +649,7 @@ public bool TryCreateNewTransition(
/// <summary>Gets or creates a new NFA transition.</summary>
public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset)
{
Debug.Assert(_delta is not null);
Debug.Assert(_delta is not null && _stateArray is not null);
lock (this)
{
Debug.Assert(nfaOffset < _nfaDelta.Length);
Expand All @@ -609,7 +661,9 @@ public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset
// Create the underlying transition from the core state corresponding to the nfa state
DfaMatchingState<TSet> coreState = GetCoreState(nfaStateId);
int coreOffset = (coreState.Id << _mintermsLog) | mintermId;
DfaMatchingState<TSet>? coreTarget = _delta[coreOffset] ?? CreateNewTransition(coreState, mintermId, coreOffset);
int coreTargetId = _delta[coreOffset];
DfaMatchingState<TSet>? coreTarget = coreTargetId > 0 ?
_stateArray[coreTargetId] : CreateNewTransition(coreState, mintermId, coreOffset);

SymbolicRegexNode<TSet> node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ?
coreTarget.Node._left! : coreTarget.Node;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public override void SaveDGML(TextWriter writer, int maxLabelLength)
foreach (DfaMatchingState<TSet> state in _builder._stateCache)
{
writer.WriteLine(" <Node Id=\"{0}\" Label=\"{0}\" Category=\"State\" Group=\"Collapsed\" StateInfo=\"{1}\">", state.Id, state.DgmlView);
if (state.IsInitialState)
if (_builder.GetStateInfo(state.Id).IsInitial)
{
writer.WriteLine(" <Category Ref=\"InitialState\" />");
}
Expand Down Expand Up @@ -143,16 +143,17 @@ public override void SaveDGML(TextWriter writer, int maxLabelLength)
foreach (DfaMatchingState<TSet> source in builder._stateCache)
{
// Get the span of entries in delta that gives the transitions for the different minterms
Span<DfaMatchingState<TSet>?> deltas = builder.GetDeltasFor(source);
Span<int> deltas = builder.GetDeltasFor(source);
Span<int[]?> nfaDeltas = builder.GetNfaDeltasFor(source);
Debug.Assert(deltas.Length == builder._minterms.Length);
for (int i = 0; i < deltas.Length; ++i)
{
// null entries are transitions not explored yet, so skip them
if (deltas[i] is DfaMatchingState<TSet> target)
// negative entries are transitions not explored yet, so skip them
int targetId = deltas[i];
if (targetId >= 0)
{
// Get or create the data for this (source,destination) state ID pair
(int Source, int Target) key = (source.Id, target.Id);
(int Source, int Target) key = (source.Id, targetId);
if (!result.TryGetValue(key, out (TSet Rule, List<int> NfaTargets) entry))
{
entry = (builder._solver.Empty, new List<int>());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,25 +83,25 @@ public override IEnumerable<string> SampleMatches(int k, int randomseed)
{
// Unconditionally final state or end of the input due to \Z anchor for example
if (NfaStateHandler.IsNullable(ref statesWrapper) ||
NfaStateHandler.IsNullable(ref statesWrapper, CharKind.BeginningEnd))
NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
{
possibleEndings.Add("");
}

// End of line due to end-of-line anchor
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.Newline))
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
{
possibleEndings.Add("\n");
}

// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.WordLetter))
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
{
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
}

// Related to wordborder due to \b or \B
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.General))
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
{
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
}
Expand All @@ -125,7 +125,7 @@ public override IEnumerable<string> SampleMatches(int k, int randomseed)
}

// Shuffle the minterms, including the last end-of-line marker if appropriate
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(ref statesWrapper) ?
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
Shuffle(random, mintermIdsWithZ) :
Shuffle(random, mintermIdsWithoutZ);
foreach (int mintermId in mintermIds)
Expand Down
Loading