From 1e48eca9577433526c5c649a87ca2f2545d35958 Mon Sep 17 00:00:00 2001 From: Margus Veanes Date: Wed, 3 Nov 2021 16:55:39 -0700 Subject: [PATCH] lock protect nullability cache of symbolic regex node (#60942) * fixed nullability checking to be threadsafe * use volatile write for nullability cache Co-authored-by: Stephen Toub * made read of nullability cache volatile and fixed other PR comments * made non-lock-protected reads from SymbolicRegexBuilder._delta volatile * Apply suggestions from code review Co-authored-by: Stephen Toub --- .../RegularExpressions/Symbolic/CharKind.cs | 5 +- .../Symbolic/SymbolicRegexMatcher.cs | 6 +- .../Symbolic/SymbolicRegexNode.cs | 149 ++++++++++-------- .../tests/Regex.Match.Tests.cs | 28 ++++ .../tests/RegexExperiment.cs | 5 +- 5 files changed, 121 insertions(+), 72 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs index 39f95920da31b..1952a7dcd1e00 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/CharKind.cs @@ -29,9 +29,12 @@ internal static class CharKind /// Gets the next character kind from a context internal static uint Next(uint context) => context >> 3; - /// Creates the context of the previous and the next character kinds. + /// Encodes the pair (prevKind, nextKind) using 6 bits internal static uint Context(uint prevKind, uint nextKind) => (nextKind << 3) | prevKind; + /// Exclusive maximum context (limit) is 64 because a context uses bit-shifting where each kind needs 3 bits. + internal const int ContextLimit = 64; + internal static string DescribePrev(uint i) => i switch { StartStop => @"\A", diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index ac56fd972d0cf..a8cec12036225 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -359,9 +359,7 @@ public DfaMatchingState TakeTransition( Debug.Assert(builder._delta is not null); int offset = (currentState.Id << builder._mintermsCount) | mintermId; - return - builder._delta[offset] ?? - matcher.CreateNewTransition(currentState, minterm, offset); + return Volatile.Read(ref builder._delta[offset]) ?? matcher.CreateNewTransition(currentState, minterm, offset); } } @@ -391,7 +389,7 @@ public DfaMatchingState TakeTransition( DfaMatchingState nextStates = builder.MkState(oneState, currentStates.PrevCharKind); int offset = (nextStates.Id << builder._mintermsCount) | mintermId; - DfaMatchingState p = builder._delta[offset] ?? matcher.CreateNewTransition(nextStates, minterm, offset); + DfaMatchingState p = Volatile.Read(ref builder._delta[offset]) ?? matcher.CreateNewTransition(nextStates, minterm, offset); // Observe that if p.Node is an Or it will be flattened. union = builder.MkOr2(union, p.Node); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index 41dfdf3898bf5..bf46baf57c147 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -13,6 +13,12 @@ namespace System.Text.RegularExpressions.Symbolic internal sealed class SymbolicRegexNode where S : notnull { internal const string EmptyCharClass = "[]"; + /// Some byte other than 0 to represent true + internal const byte TrueByte = 1; + /// Some byte other than 0 to represent false + internal const byte FalseByte = 2; + /// The undefined value is the default value 0 + internal const byte UndefinedByte = 0; internal readonly SymbolicRegexBuilder _builder; internal readonly SymbolicRegexKind _kind; @@ -23,7 +29,11 @@ internal sealed class SymbolicRegexNode where S : notnull internal readonly SymbolicRegexNode? _right; internal readonly SymbolicRegexSet? _alts; - private Dictionary? _nullabilityCache; + /// + /// Caches nullability of this node for any given context (0 <= context < ContextLimit) + /// when _info.StartsWithSomeAnchor and _info.CanBeNullable are true. Otherwise the cache is null. + /// + private byte[]? _nullabilityCache; private S _startSet; @@ -50,6 +60,7 @@ private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexKind kin _info = info; _hashcode = ComputeHashCode(); _startSet = ComputeStartSet(); + _nullabilityCache = info.StartsWithSomeAnchor && info.CanBeNullable ? new byte[CharKind.ContextLimit] : null; } private bool _isInternalizedUnion; @@ -162,92 +173,100 @@ static void AppendToList(SymbolicRegexNode concat, List> /// kind info for previous and next characters internal bool IsNullableFor(uint context) { - if (!_info.StartsWithSomeAnchor) - return IsNullable; - - if (!_info.CanBeNullable) - return false; + if (_nullabilityCache is null) + { + // if _nullabilityCache is null then IsNullable==CanBeNullable + // Observe that if IsNullable==true then CanBeNullable==true. + // but when the node does not start with an anchor + // and IsNullable==false then CanBeNullable==false. + return _info.IsNullable; + } if (!StackHelper.TryEnsureSufficientExecutionStack()) { return StackHelper.CallOnEmptyStack(IsNullableFor, context); } - // Initialize the nullability cache for this node. - _nullabilityCache ??= new Dictionary(); + Debug.Assert(context < CharKind.ContextLimit); - if (!_nullabilityCache.TryGetValue(context, out bool is_nullable)) + // If nullablity has been computed for the given context then return it + byte b = Volatile.Read(ref _nullabilityCache[context]); + if (b != UndefinedByte) { - switch (_kind) - { - case SymbolicRegexKind.Loop: - Debug.Assert(_left is not null); - is_nullable = _lower == 0 || _left.IsNullableFor(context); - break; + return b == TrueByte; + } - case SymbolicRegexKind.Concat: - Debug.Assert(_left is not null && _right is not null); - is_nullable = _left.IsNullableFor(context) && _right.IsNullableFor(context); - break; + // Otherwise compute the nullability recursively for the given context + bool is_nullable; + switch (_kind) + { + case SymbolicRegexKind.Loop: + Debug.Assert(_left is not null); + is_nullable = _lower == 0 || _left.IsNullableFor(context); + break; - case SymbolicRegexKind.Or: - case SymbolicRegexKind.And: - Debug.Assert(_alts is not null); - is_nullable = _alts.IsNullableFor(context); - break; + case SymbolicRegexKind.Concat: + Debug.Assert(_left is not null && _right is not null); + is_nullable = _left.IsNullableFor(context) && _right.IsNullableFor(context); + break; - case SymbolicRegexKind.Not: - Debug.Assert(_left is not null); - is_nullable = !_left.IsNullableFor(context); - break; + case SymbolicRegexKind.Or: + case SymbolicRegexKind.And: + Debug.Assert(_alts is not null); + is_nullable = _alts.IsNullableFor(context); + break; - case SymbolicRegexKind.StartAnchor: - is_nullable = CharKind.Prev(context) == CharKind.StartStop; - break; + case SymbolicRegexKind.Not: + Debug.Assert(_left is not null); + is_nullable = !_left.IsNullableFor(context); + break; - case SymbolicRegexKind.EndAnchor: - is_nullable = CharKind.Next(context) == CharKind.StartStop; - break; + case SymbolicRegexKind.StartAnchor: + is_nullable = CharKind.Prev(context) == CharKind.StartStop; + break; - case SymbolicRegexKind.BOLAnchor: - // Beg-Of-Line anchor is nullable when the previous character is Newline or Start - // note: at least one of the bits must be 1, but both could also be 1 in case of very first newline - is_nullable = (CharKind.Prev(context) & CharKind.NewLineS) != 0; - break; + case SymbolicRegexKind.EndAnchor: + is_nullable = CharKind.Next(context) == CharKind.StartStop; + break; - case SymbolicRegexKind.EOLAnchor: - // End-Of-Line anchor is nullable when the next character is Newline or Stop - // note: at least one of the bits must be 1, but both could also be 1 in case of \Z - is_nullable = (CharKind.Next(context) & CharKind.NewLineS) != 0; - break; + case SymbolicRegexKind.BOLAnchor: + // Beg-Of-Line anchor is nullable when the previous character is Newline or Start + // note: at least one of the bits must be 1, but both could also be 1 in case of very first newline + is_nullable = (CharKind.Prev(context) & CharKind.NewLineS) != 0; + break; - case SymbolicRegexKind.WBAnchor: - // test that prev char is word letter iff next is not not word letter - is_nullable = ((CharKind.Prev(context) & CharKind.WordLetter) ^ (CharKind.Next(context) & CharKind.WordLetter)) != 0; - break; + case SymbolicRegexKind.EOLAnchor: + // End-Of-Line anchor is nullable when the next character is Newline or Stop + // note: at least one of the bits must be 1, but both could also be 1 in case of \Z + is_nullable = (CharKind.Next(context) & CharKind.NewLineS) != 0; + break; - case SymbolicRegexKind.NWBAnchor: - // test that prev char is word letter iff next is word letter - is_nullable = ((CharKind.Prev(context) & CharKind.WordLetter) ^ (CharKind.Next(context) & CharKind.WordLetter)) == 0; - break; + case SymbolicRegexKind.WBAnchor: + // test that prev char is word letter iff next is not not word letter + is_nullable = ((CharKind.Prev(context) & CharKind.WordLetter) ^ (CharKind.Next(context) & CharKind.WordLetter)) != 0; + break; - case SymbolicRegexKind.EndAnchorZ: - // \Z anchor is nullable when the next character is either the last Newline or Stop - // note: CharKind.NewLineS == CharKind.Newline|CharKind.StartStop - is_nullable = (CharKind.Next(context) & CharKind.StartStop) != 0; - break; + case SymbolicRegexKind.NWBAnchor: + // test that prev char is word letter iff next is word letter + is_nullable = ((CharKind.Prev(context) & CharKind.WordLetter) ^ (CharKind.Next(context) & CharKind.WordLetter)) == 0; + break; - default: //SymbolicRegexKind.EndAnchorZRev: - // EndAnchorZRev (rev(\Z)) anchor is nullable when the prev character is either the first Newline or Start - // note: CharKind.NewLineS == CharKind.Newline|CharKind.StartStop - Debug.Assert(_kind == SymbolicRegexKind.EndAnchorZRev); - is_nullable = (CharKind.Prev(context) & CharKind.StartStop) != 0; - break; - } + case SymbolicRegexKind.EndAnchorZ: + // \Z anchor is nullable when the next character is either the last Newline or Stop + // note: CharKind.NewLineS == CharKind.Newline|CharKind.StartStop + is_nullable = (CharKind.Next(context) & CharKind.StartStop) != 0; + break; - _nullabilityCache[context] = is_nullable; + default: // SymbolicRegexKind.EndAnchorZRev: + // EndAnchorZRev (rev(\Z)) anchor is nullable when the prev character is either the first Newline or Start + // note: CharKind.NewLineS == CharKind.Newline|CharKind.StartStop + Debug.Assert(_kind == SymbolicRegexKind.EndAnchorZRev); + is_nullable = (CharKind.Prev(context) & CharKind.StartStop) != 0; + break; } + Volatile.Write(ref _nullabilityCache[context], is_nullable ? TrueByte : FalseByte); + return is_nullable; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 6dd56fc109e9b..e7f0b4d298553 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1827,5 +1827,33 @@ public async Task UseRegexConcurrently_ThreadSafe_Success(RegexEngine engine, Ti }, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default)).ToArray()); } } + + [Theory] + [MemberData(nameof(MatchWordsInAnchoredRegexes_TestData))] + public async Task MatchWordsInAnchoredRegexes(RegexEngine engine, RegexOptions options, string pattern, string input, (int, int)[] matches) + { + // The aim of these test is to test corner cases of matches involving anchors + // For NonBacktracking these tests are meant to + // cover most contexts in _nullabilityForContext in SymbolicRegexNode + Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, options); + MatchCollection ms = r.Matches(input); + Assert.Equal(matches.Length, ms.Count); + for (int i = 0; i < matches.Length; i++) + { + Assert.Equal(ms[i].Index, matches[i].Item1); + Assert.Equal(ms[i].Length, matches[i].Item2); + } + } + + public static IEnumerable MatchWordsInAnchoredRegexes_TestData() + { + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + yield return new object[] { engine, RegexOptions.None, @"\b\w{10,}\b", "this is a complicated word in a\nnontrivial sentence", new (int, int)[] { (10, 11), (32, 10) } }; + yield return new object[] { engine, RegexOptions.Multiline, @"^\w{10,}\b", "this is a\ncomplicated word in a\nnontrivial sentence", new (int, int)[] { (10, 11), (32, 10) } }; + yield return new object[] { engine, RegexOptions.None, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}\b", "date 10/12/1966 and 10/12/66 are the same", new (int, int)[] { (5, 10), (20, 8) } }; + yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; + } + } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs index 85d17ebc4dd48..ee6339561b8bc 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexExperiment.cs @@ -520,8 +520,9 @@ public void TestConjunctionOverCounting(string conjunct1, string conjunct2, stri Assert.Contains("conditional", e.Message); } } + #endregion - + #region Random input generation tests public static IEnumerable GenerateRandomMembers_TestData() { string[] patterns = new string[] { @"pa[5\$s]{2}w[o0]rd$", @"\w\d+", @"\d{10}" }; @@ -536,7 +537,7 @@ public static IEnumerable GenerateRandomMembers_TestData() { foreach (string input in inputs) { - yield return new object[] {engine, pattern, input, !negative }; + yield return new object[] { engine, pattern, input, !negative }; } } }