From 508190319a411a3d8595354d996e55ac5479d000 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 3 Jan 2024 02:05:34 +0100 Subject: [PATCH 1/3] Specialize by length in single-value SearchValues --- .../Helpers/StringSearchValuesHelper.cs | 120 ++++++++++++++++-- .../SingleStringSearchValuesThreeChars.cs | 11 +- .../Strings/StringSearchValues.cs | 55 +++++--- .../Strings/StringSearchValuesBase.cs | 2 +- .../src/System/Text/Ascii.Equality.cs | 3 + 5 files changed, 160 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs index e4d43d925d898..00bce28f39bd7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Text; namespace System.Buffers { @@ -61,7 +62,7 @@ public static bool StartsWith(ref char matchStart, int lengthR return false; } - return TCaseSensitivity.Equals(ref matchStart, candidate); + return TCaseSensitivity.Equals(ref matchStart, candidate); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -79,13 +80,37 @@ private static bool ScalarEquals(ref char matchStart, string c return true; } + public interface IValueLength + { + static abstract bool AtLeast4Chars { get; } + static abstract bool AtLeast8CharsOrUnknown { get; } + } + + public readonly struct ValueLengthLessThan4 : IValueLength + { + public static bool AtLeast4Chars => false; + public static bool AtLeast8CharsOrUnknown => false; + } + + public readonly struct ValueLength4To7 : IValueLength + { + public static bool AtLeast4Chars => true; + public static bool AtLeast8CharsOrUnknown => 
false; + } + + public readonly struct ValueLength8OrLongerOrUnknown : IValueLength + { + public static bool AtLeast4Chars => true; + public static bool AtLeast8CharsOrUnknown => true; + } + public interface ICaseSensitivity { static abstract char TransformInput(char input); static abstract Vector128 TransformInput(Vector128 input); static abstract Vector256 TransformInput(Vector256 input); static abstract Vector512 TransformInput(Vector512 input); - static abstract bool Equals(ref char matchStart, string candidate); + static abstract bool Equals(ref char matchStart, string candidate) where TValueLength : struct, IValueLength; } // Performs no case transformations. @@ -104,8 +129,37 @@ public interface ICaseSensitivity public static Vector512 TransformInput(Vector512 input) => input; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool Equals(ref char matchStart, string candidate) => - ScalarEquals(ref matchStart, candidate); + public static bool Equals(ref char matchStart, string candidate) + where TValueLength : struct, IValueLength + { + Debug.Assert(candidate.Length > 1); + + ref byte first = ref Unsafe.As(ref matchStart); + ref byte second = ref Unsafe.As(ref candidate.GetRawStringData()); + nuint byteLength = (nuint)(uint)candidate.Length * 2; + + if (TValueLength.AtLeast8CharsOrUnknown) + { + return SpanHelpers.SequenceEqual(ref first, ref second, byteLength); + } + + Debug.Assert(matchStart == candidate[0], "This should only be called after the first character has been checked"); + + if (TValueLength.AtLeast4Chars) + { + nuint offset = byteLength - sizeof(ulong); + ulong differentBits = Unsafe.ReadUnaligned(ref first) - Unsafe.ReadUnaligned(ref second); + differentBits |= Unsafe.ReadUnaligned(ref Unsafe.Add(ref first, offset)) - Unsafe.ReadUnaligned(ref Unsafe.Add(ref second, offset)); + return differentBits == 0; + } + else + { + nuint offset = byteLength - sizeof(uint); + + return Unsafe.ReadUnaligned(ref Unsafe.Add(ref first, offset)) + 
== Unsafe.ReadUnaligned(ref Unsafe.Add(ref second, offset)); + } + } } // Transforms inputs to their uppercase variants with the assumption that all input characters are ASCII letters. @@ -125,8 +179,38 @@ public static bool Equals(ref char matchStart, string candidate) => public static Vector512 TransformInput(Vector512 input) => input & Vector512.Create(unchecked((byte)~0x20)); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool Equals(ref char matchStart, string candidate) => - ScalarEquals(ref matchStart, candidate); + public static bool Equals(ref char matchStart, string candidate) + where TValueLength : struct, IValueLength + { + Debug.Assert(candidate.Length > 1); + Debug.Assert(candidate.ToUpperInvariant() == candidate); + + if (TValueLength.AtLeast8CharsOrUnknown) + { + return Ascii.EqualsIgnoreCase(ref matchStart, ref candidate.GetRawStringData(), (uint)candidate.Length); + } + + ref byte first = ref Unsafe.As(ref matchStart); + ref byte second = ref Unsafe.As(ref candidate.GetRawStringData()); + nuint byteLength = (nuint)(uint)candidate.Length * 2; + + if (TValueLength.AtLeast4Chars) + { + const ulong CaseMask = ~0x20002000200020u; + nuint offset = byteLength - sizeof(ulong); + ulong differentBits = (Unsafe.ReadUnaligned(ref first) & CaseMask) - Unsafe.ReadUnaligned(ref second); + differentBits |= (Unsafe.ReadUnaligned(ref Unsafe.Add(ref first, offset)) & CaseMask) - Unsafe.ReadUnaligned(ref Unsafe.Add(ref second, offset)); + return differentBits == 0; + } + else + { + const uint CaseMask = ~0x200020u; + nuint offset = byteLength - sizeof(uint); + uint differentBits = (Unsafe.ReadUnaligned(ref first) & CaseMask) - Unsafe.ReadUnaligned(ref second); + differentBits |= (Unsafe.ReadUnaligned(ref Unsafe.Add(ref first, offset)) & CaseMask) - Unsafe.ReadUnaligned(ref Unsafe.Add(ref second, offset)); + return differentBits == 0; + } + } } // Transforms inputs to their uppercase variants with the assumption that all input characters are ASCII. 
@@ -170,8 +254,16 @@ public static Vector512 TransformInput(Vector512 input) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool Equals(ref char matchStart, string candidate) => - ScalarEquals(ref matchStart, candidate); + public static bool Equals(ref char matchStart, string candidate) + where TValueLength : struct, IValueLength + { + if (TValueLength.AtLeast8CharsOrUnknown) + { + return Ascii.EqualsIgnoreCase(ref matchStart, ref candidate.GetRawStringData(), (uint)candidate.Length); + } + + return ScalarEquals(ref matchStart, candidate); + } } // We can't efficiently map non-ASCII inputs to their Ordinal uppercase variants, @@ -184,8 +276,16 @@ public static bool Equals(ref char matchStart, string candidate) => public static Vector512 TransformInput(Vector512 input) => throw new UnreachableException(); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool Equals(ref char matchStart, string candidate) => - Ordinal.EqualsIgnoreCase(ref matchStart, ref candidate.GetRawStringData(), candidate.Length); + public static bool Equals(ref char matchStart, string candidate) + where TValueLength : struct, IValueLength + { + if (TValueLength.AtLeast8CharsOrUnknown) + { + return Ordinal.EqualsIgnoreCase(ref matchStart, ref candidate.GetRawStringData(), candidate.Length); + } + + return Ordinal.EqualsIgnoreCase_Scalar(ref matchStart, ref candidate.GetRawStringData(), candidate.Length); + } } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs index 2f919bfbb8d50..548ef6215540f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs @@ -15,7 +15,8 @@ namespace System.Buffers // This 
implementation uses 3 precomputed anchor points when searching. // This implementation may also be used for length=2 values, in which case two anchors point at the same position. // Has an O(i * m) worst-case, with the expected time closer to O(n) for most inputs. - internal sealed class SingleStringSearchValuesThreeChars : StringSearchValuesBase + internal sealed class SingleStringSearchValuesThreeChars : StringSearchValuesBase + where TValueLength : struct, IValueLength where TCaseSensitivity : struct, ICaseSensitivity { private const ushort CaseConversionMask = unchecked((ushort)~0x20); @@ -228,7 +229,7 @@ private int IndexOf(ref char searchSpace, int searchSpaceLength) // CaseInsensitiveUnicode doesn't support single-character transformations, so we skip checking the first character first. if ((typeof(TCaseSensitivity) == typeof(CaseInsensitiveUnicode) || TCaseSensitivity.TransformInput(cur) == valueHead) && - TCaseSensitivity.Equals(ref cur, value)) + TCaseSensitivity.Equals(ref cur, value)) { return (int)i; } @@ -325,7 +326,8 @@ private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref matchRef, _value.Length); - if (TCaseSensitivity.Equals(ref matchRef, _value)) + if ((typeof(TCaseSensitivity) == typeof(CaseSensitive) && !TValueLength.AtLeast4Chars) || + TCaseSensitivity.Equals(ref matchRef, _value)) { offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref matchRef) / 2); return true; @@ -353,7 +355,8 @@ private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref matchRef, _value.Length); - if (TCaseSensitivity.Equals(ref matchRef, _value)) + if ((typeof(TCaseSensitivity) == typeof(CaseSensitive) && !TValueLength.AtLeast4Chars) || + TCaseSensitivity.Equals(ref matchRef, _value)) { offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref matchRef) / 
2); return true; diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index e0829ef02c007..3589ed7710834 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -309,25 +309,16 @@ private static SearchValues CreateForSingleValue( if (Vector128.IsHardwareAccelerated && value.Length > 1 && value.Length <= maxLength) { - if (!ignoreCase) + SearchValues? searchValues = value.Length switch { - return new SingleStringSearchValuesThreeChars(uniqueValues, value); - } - - if (asciiLettersOnly) - { - return new SingleStringSearchValuesThreeChars(uniqueValues, value); - } + < 4 => TryCreateSingleValuesThreeChars(value, uniqueValues, ignoreCase, allAscii, asciiLettersOnly), + < 8 => TryCreateSingleValuesThreeChars(value, uniqueValues, ignoreCase, allAscii, asciiLettersOnly), + _ => TryCreateSingleValuesThreeChars(value, uniqueValues, ignoreCase, allAscii, asciiLettersOnly), + }; - if (allAscii) + if (searchValues is not null) { - return new SingleStringSearchValuesThreeChars(uniqueValues, value); - } - - // When ignoring casing, all anchor chars we search for must be ASCII. - if (char.IsAscii(value[0]) && value.AsSpan().LastIndexOfAnyInRange((char)0, (char)127) > 0) - { - return new SingleStringSearchValuesThreeChars(uniqueValues, value); + return searchValues; } } @@ -338,6 +329,38 @@ private static SearchValues CreateForSingleValue( : new SingleStringSearchValuesFallback(value, uniqueValues); } + private static SearchValues? TryCreateSingleValuesThreeChars( + string value, + HashSet? 
uniqueValues, + bool ignoreCase, + bool allAscii, + bool asciiLettersOnly) + where TValueLength : struct, IValueLength + { + if (!ignoreCase) + { + return new SingleStringSearchValuesThreeChars(uniqueValues, value); + } + + if (asciiLettersOnly) + { + return new SingleStringSearchValuesThreeChars(uniqueValues, value); + } + + if (allAscii) + { + return new SingleStringSearchValuesThreeChars(uniqueValues, value); + } + + // When ignoring casing, all anchor chars we search for must be ASCII. + if (char.IsAscii(value[0]) && value.AsSpan().LastIndexOfAnyInRange((char)0, (char)127) > 0) + { + return new SingleStringSearchValuesThreeChars(uniqueValues, value); + } + + return null; + } + private static void AnalyzeValues( ReadOnlySpan values, ref bool ignoreCase, diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValuesBase.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValuesBase.cs index 7b21b715a0b29..329ac4813a69b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValuesBase.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValuesBase.cs @@ -18,7 +18,7 @@ internal abstract class StringSearchValuesBase : SearchValues private readonly HashSet? _uniqueValues; /// - /// This exists to allow to avoid the HashSet allocation. + /// This exists to allow to avoid the HashSet allocation. 
/// protected bool HasUniqueValues => _uniqueValues is not null; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Equality.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Equality.cs index 31df908d9afb0..141ce3b81a354 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Equality.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Equality.cs @@ -189,6 +189,9 @@ public static bool EqualsIgnoreCase(ReadOnlySpan left, ReadOnlySpan => left.Length == right.Length && EqualsIgnoreCase>(ref Unsafe.As(ref MemoryMarshal.GetReference(left)), ref Unsafe.As(ref MemoryMarshal.GetReference(right)), (uint)right.Length); + internal static bool EqualsIgnoreCase(ref char left, ref char right, nuint length) => + EqualsIgnoreCase>(ref Unsafe.As(ref left), ref Unsafe.As(ref right), length); + private static bool EqualsIgnoreCase(ref TLeft left, ref TRight right, nuint length) where TLeft : unmanaged, INumberBase where TRight : unmanaged, INumberBase From 7ac198fee1cb868104ea34854348b5c619ecee19 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 3 Jan 2024 02:29:01 +0100 Subject: [PATCH 2/3] Extra assert --- .../SearchValues/Strings/SingleStringSearchValuesThreeChars.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs index 548ef6215540f..7309b43febd18 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs @@ -35,6 +35,7 @@ public SingleStringSearchValuesThreeChars(HashSet? uniqueValues, string { // We could have more than one entry in 'uniqueValues' if this value is an exact prefix of all the others. 
Debug.Assert(value.Length > 1); + Debug.Assert((value.Length >= 8) == TValueLength.AtLeast8CharsOrUnknown); CharacterFrequencyHelper.GetSingleStringMultiCharacterOffsets(value, IgnoreCase, out int ch2Offset, out int ch3Offset); From f9cf109ce42b18957672788edda5372d8e23669f Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 3 Jan 2024 17:17:58 +0100 Subject: [PATCH 3/3] More comments --- .../Strings/Helpers/StringSearchValuesHelper.cs | 5 +++++ .../Strings/SingleStringSearchValuesThreeChars.cs | 6 ++++++ .../src/System/SearchValues/Strings/StringSearchValues.cs | 5 +++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs index 00bce28f39bd7..9f1ba0ebf1771 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs @@ -98,6 +98,7 @@ public interface IValueLength public static bool AtLeast8CharsOrUnknown => false; } + // "Unknown" is currently only used by Teddy when confirming matches. public readonly struct ValueLength8OrLongerOrUnknown : IValueLength { public static bool AtLeast4Chars => true; @@ -154,6 +155,10 @@ public static bool Equals(ref char matchStart, string candidate) } else { + Debug.Assert(candidate.Length is 2 or 3); + + // We know that the candidate is 2 or 3 characters long, and that the first character has already been checked. + // We only have to check that the last 2 characters also match. 
nuint offset = byteLength - sizeof(uint); return Unsafe.ReadUnaligned(ref Unsafe.Add(ref first, offset)) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs index 7309b43febd18..cadfa9f1022ce 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/SingleStringSearchValuesThreeChars.cs @@ -327,6 +327,9 @@ private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref matchRef, _value.Length); + // If the value is short (!TValueLength.AtLeast4Chars => 2 or 3 characters), the anchors already represent the whole value. + // With case-sensitive comparisons, we've therefore already confirmed the match, so we can skip doing so here. + // With case-insensitive comparisons, we applied a mask to the input, so while the anchors likely matched, we can't be sure. if ((typeof(TCaseSensitivity) == typeof(CaseSensitive) && !TValueLength.AtLeast4Chars) || TCaseSensitivity.Equals(ref matchRef, _value)) { @@ -356,6 +359,9 @@ private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref matchRef, _value.Length); + // If the value is short (!TValueLength.AtLeast4Chars => 2 or 3 characters), the anchors already represent the whole value. + // With case-sensitive comparisons, we've therefore already confirmed the match, so we can skip doing so here. + // With case-insensitive comparisons, we applied a mask to the input, so while the anchors likely matched, we can't be sure. 
if ((typeof(TCaseSensitivity) == typeof(CaseSensitive) && !TValueLength.AtLeast4Chars) || TCaseSensitivity.Equals(ref matchRef, _value)) { diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index 3589ed7710834..86a13dd04b9b0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -352,8 +352,9 @@ private static SearchValues CreateForSingleValue( return new SingleStringSearchValuesThreeChars(uniqueValues, value); } - // When ignoring casing, all anchor chars we search for must be ASCII. - if (char.IsAscii(value[0]) && value.AsSpan().LastIndexOfAnyInRange((char)0, (char)127) > 0) + // SingleStringSearchValuesThreeChars doesn't have logic to handle non-ASCII case conversion, so we require that anchor characters are ASCII. + // Right now we're always selecting the first character as one of the anchors, and we need at least two. + if (char.IsAscii(value[0]) && value.AsSpan(1).ContainsAnyInRange((char)0, (char)127)) { return new SingleStringSearchValuesThreeChars(uniqueValues, value); }