From a48a39fbb29aa745713c4f1028bcc9df84a5da71 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Thu, 11 Jul 2024 00:40:26 +0200 Subject: [PATCH] Revert "Add Avx512 support to IndexOfAnyAsciiSearcher (#103710)" (#104688) * Revert "Add Avx512 support to IndexOfAnyAsciiSearcher (#103710)" This reverts commit ce1ae77f9fc565facdf041a740e48935e7b60c3b. * Keep the test improvements * Keep the inlining changes * Keep the shift improvements --- .../SearchValues/IndexOfAnyAsciiSearcher.cs | 567 +----------------- .../Strings/Helpers/AhoCorasick.cs | 2 +- 2 files changed, 31 insertions(+), 538 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs index ec5cfef5610fd3..bf06e365f40c21 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs @@ -17,36 +17,18 @@ internal static class IndexOfAnyAsciiSearcher { public struct AsciiState(Vector128 bitmap, BitVector256 lookup) { - public Vector512 Bitmap512 = Vector512.Create(bitmap); + public Vector256 Bitmap = Vector256.Create(bitmap); public BitVector256 Lookup = lookup; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector128 Bitmap128() => Bitmap512._lower._lower; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector256 Bitmap256() => Bitmap512._lower; - public readonly AsciiState CreateInverse() => - new AsciiState(~Bitmap128(), Lookup.CreateInverse()); + new AsciiState(~Bitmap._lower, Lookup.CreateInverse()); } public struct AnyByteState(Vector128 bitmap0, Vector128 bitmap1, BitVector256 lookup) { - public Vector512 Bitmap0_512 = Vector512.Create(bitmap0); - public Vector512 Bitmap1_512 = Vector512.Create(bitmap1); + public Vector256 Bitmap0 = Vector256.Create(bitmap0); + public Vector256 Bitmap1 = Vector256.Create(bitmap1); public BitVector256 Lookup = lookup; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector128 Bitmap0_128() => Bitmap0_512._lower._lower; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector128 Bitmap1_128() => Bitmap1_512._lower._lower; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector256 Bitmap0_256() => Bitmap0_512._lower; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public readonly Vector256 Bitmap1_256() => Bitmap1_512._lower; } internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; @@ -160,11 +142,11 @@ private static unsafe bool TryIndexOfAny(ref short searchSpace, int se { AsciiState state = default; - if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap512._lower._lower, out bool needleContainsZero)) + if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap._lower, out bool needleContainsZero)) { // Only initializing the bitmap here is okay as we can only get here if the search space is long enough // and we support vectorization, so the IndexOfAnyVectorized implementation will never touch state.Lookup. - state.Bitmap512 = Vector512.Create(state.Bitmap128()); + state.Bitmap = Vector256.Create(state.Bitmap.GetLower()); index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero ? IndexOfAny(ref searchSpace, searchSpaceLength, ref state) @@ -187,11 +169,11 @@ private static unsafe bool TryLastIndexOfAny(ref short searchSpace, in { AsciiState state = default; - if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap512._lower._lower, out bool needleContainsZero)) + if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap._lower, out bool needleContainsZero)) { // Only initializing the bitmap here is okay as we can only get here if the search space is long enough // and we support vectorization, so the LastIndexOfAnyVectorized implementation will never touch state.Lookup. - state.Bitmap512 = Vector512.Create(state.Bitmap128()); + state.Bitmap = Vector256.Create(state.Bitmap.GetLower()); index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero ? LastIndexOfAny(ref searchSpace, searchSpaceLength, ref state) @@ -255,68 +237,9 @@ private static TResult IndexOfAnyCore 2 * Vector128.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough - { - Vector512 bitmap512 = state.Bitmap512; - - if (searchSpaceLength > 2 * Vector512.Count) - { - // Process the input in chunks of 64 characters (2 * Vector512). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512. - // As packing two Vector512s into a Vector512 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector512.Count)); - - do - { - Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); - Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); - - Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); - } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); - } - - // We have 1-64 characters remaining. Process the first and last vector in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector512.Count, "We expect that the input is long enough for us to load a whole vector."); - { - ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); - - ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) - ? ref oneVectorAwayFromEnd - : ref currentSearchSpace; - - Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); - Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); - - Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); - } - } + Vector256 bitmap256 = state.Bitmap; - return TResultMapper.NotFound; - } - - Vector256 bitmap256 = state.Bitmap256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > 2 * Vector256.Count) { // Process the input in chunks of 32 characters (2 * Vector256). // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. @@ -365,7 +288,7 @@ private static TResult IndexOfAnyCore bitmap = state.Bitmap128(); + Vector128 bitmap = state.Bitmap._lower; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) @@ -445,64 +368,9 @@ public static int LastIndexOfAny(ref short searchSpace if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256.Count) - { - Vector512 bitmap512 = state.Bitmap512; - - if (searchSpaceLength > 2 * Vector512.Count) - { - // Process the input in chunks of 64 characters (2 * Vector512). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512. - // As packing two Vector512s into a Vector512 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector512.Count); - - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector512.Count); - - Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); - Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); - - Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); - if (result != Vector512.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); - } - - // We have 1-64 characters remaining. Process the first and last vector in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector512.Count, "We expect that the input is long enough for us to load a whole vector."); - { - ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); + Vector256 bitmap256 = state.Bitmap; - ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count) - : ref searchSpace; - - Vector512 source0 = Vector512.LoadUnsafe(ref searchSpace); - Vector512 source1 = Vector512.LoadUnsafe(ref secondVector); - - Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); - if (result != Vector512.Zero) - { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); - } - } - - return -1; - } - - Vector256 bitmap256 = state.Bitmap256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > 2 * Vector256.Count) { // Process the input in chunks of 32 characters (2 * Vector256). // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. @@ -551,7 +419,7 @@ public static int LastIndexOfAny(ref short searchSpace return -1; } - Vector128 bitmap = state.Bitmap128(); + Vector128 bitmap = state.Bitmap._lower; if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) { @@ -650,62 +518,9 @@ private static TResult IndexOfAnyCore(ref byte if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) - { - Vector512 bitmap512 = state.Bitmap512; - - if (searchSpaceLength > Vector512.Count) - { - // Process the input in chunks of 64 bytes. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); - - do - { - Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector256 bitmap256 = state.Bitmap; - Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); - } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); - } - - // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); - { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - - ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) - ? ref halfVectorAwayFromEnd - : ref currentSearchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); - Vector256 source1 = Vector256.LoadUnsafe(ref halfVectorAwayFromEnd); - Vector512 source = Vector512.Create(source0, source1); - - Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); - } - } - - return TResultMapper.NotFound; - } - - Vector256 bitmap256 = state.Bitmap256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > Vector256.Count) { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -752,7 +567,7 @@ private static TResult IndexOfAnyCore(ref byte return TResultMapper.NotFound; } - Vector128 bitmap = state.Bitmap128(); + Vector128 bitmap = state.Bitmap._lower; if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) { @@ -827,62 +642,9 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) - { - Vector512 bitmap512 = state.Bitmap512; - - if (searchSpaceLength > Vector512.Count) - { - // Process the input in chunks of 64 bytes. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); - - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count); - - Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); - - Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); - if (result != Vector512.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); - } - - // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); - { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); + Vector256 bitmap256 = state.Bitmap; - ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) - : ref searchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); - Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); - Vector512 source = Vector512.Create(source0, source1); - - Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); - if (result != Vector512.Zero) - { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); - } - } - - return -1; - } - - Vector256 bitmap256 = state.Bitmap256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > Vector256.Count) { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -929,7 +691,7 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace return -1; } - Vector128 bitmap = state.Bitmap128(); + Vector128 bitmap = state.Bitmap._lower; if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) { @@ -1026,68 +788,10 @@ private static TResult IndexOfAnyCore(ref byte if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough - { - Vector512 bitmap512_0 = state.Bitmap0_512; - Vector512 bitmap512_1 = state.Bitmap1_512; - - if (searchSpaceLength > Vector512.Count) - { - // Process the input in chunks of 64 bytes. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); - - do - { - Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); - - Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); - } - - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); - } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); - } - - // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); - { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); - - ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) - ? ref halfVectorAwayFromEnd - : ref currentSearchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); - Vector256 source1 = Vector256.LoadUnsafe(ref halfVectorAwayFromEnd); - Vector512 source = Vector512.Create(source0, source1); - - Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); - if (result != Vector512.Zero) - { - return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); - } - } + Vector256 bitmap256_0 = state.Bitmap0; + Vector256 bitmap256_1 = state.Bitmap1; - return TResultMapper.NotFound; - } - - Vector256 bitmap256_0 = state.Bitmap0_256(); - Vector256 bitmap256_1 = state.Bitmap1_256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > Vector256.Count) { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -1134,8 +838,8 @@ private static TResult IndexOfAnyCore(ref byte return TResultMapper.NotFound; } - Vector128 bitmap0 = state.Bitmap0_128(); - Vector128 bitmap1 = state.Bitmap1_128(); + Vector128 bitmap0 = state.Bitmap0._lower; + Vector128 bitmap1 = state.Bitmap1._lower; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) @@ -1210,70 +914,12 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough - { - Vector512 bitmap512_0 = state.Bitmap0_512; - Vector512 bitmap512_1 = state.Bitmap1_512; - - if (searchSpaceLength > Vector512.Count) - { - // Process the input in chunks of 64 bytes. - // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); + Vector256 bitmap256_0 = state.Bitmap0; + Vector256 bitmap256_1 = state.Bitmap1; - do - { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count); - - Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); - - Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); - if (result != Vector512.Zero) - { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); - } - } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); - } - - // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. - // They may overlap, but we'll handle that in the index calculation if we do get a match. - Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); - { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - - ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) - : ref searchSpace; - - Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); - Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); - Vector512 source = Vector512.Create(source0, source1); - - Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); - if (result != Vector512.Zero) - { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); - } - } - - return -1; - } - - Vector256 bitmap256_0 = state.Bitmap0_256(); - Vector256 bitmap256_1 = state.Bitmap1_256(); - -#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer -#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false - if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) -#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough -#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing + if (searchSpaceLength > Vector256.Count) { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -1320,8 +966,8 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace return -1; } - Vector128 bitmap0 = state.Bitmap0_128(); - Vector128 bitmap1 = state.Bitmap1_128(); + Vector128 bitmap0 = state.Bitmap0._lower; + Vector128 bitmap1 = state.Bitmap1._lower; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) @@ -1442,31 +1088,6 @@ private static Vector256 IndexOfAnyLookupCore(Vector256 source, Vect return result; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512BW))] - private static Vector512 IndexOfAnyLookup(Vector512 source0, Vector512 source1, Vector512 bitmapLookup) - where TNegator : struct, INegator - where TOptimizations : struct, IOptimizations - { - Vector512 source = TOptimizations.PackSources(source0.AsUInt16(), source1.AsUInt16()); - - Vector512 result = IndexOfAnyLookupCore(source, bitmapLookup); - - return TNegator.NegateIfNeeded(result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512BW))] - private static Vector512 IndexOfAnyLookupCore(Vector512 source, Vector512 bitmapLookup) - { - // See comments in IndexOfAnyLookupCore(Vector128) above for more details. - Vector512 highNibbles = source >>> 4; - Vector512 bitMask = Avx512BW.Shuffle(bitmapLookup, source); - Vector512 bitPositions = Avx512BW.Shuffle(Vector512.Create(0x8040201008040201).AsByte(), highNibbles); - Vector512 result = bitMask & bitPositions; - return result; - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd))] @@ -1515,29 +1136,6 @@ private static Vector256 IndexOfAnyLookup(Vector256 source return TNegator.NegateIfNeeded(result); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512BW))] - private static Vector512 IndexOfAnyLookup(Vector512 source, Vector512 bitmapLookup0, Vector512 bitmapLookup1) - where TNegator : struct, INegator - { - // http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm - - Vector512 lowNibbles = source & Vector512.Create((byte)0xF); - Vector512 highNibbles = source >>> 4; - - Vector512 row0 = Avx512BW.Shuffle(bitmapLookup0, lowNibbles); - Vector512 row1 = Avx512BW.Shuffle(bitmapLookup1, lowNibbles); - - Vector512 bitmask = Avx512BW.Shuffle(Vector512.Create(0x8040201008040201).AsByte(), highNibbles); - - Vector512 mask = Vector512.GreaterThan(highNibbles.AsSByte(), Vector512.Create((sbyte)0x7)).AsByte(); - Vector512 bitsets = Vector512.ConditionalSelect(mask, row1, row0); - - Vector512 result = Vector512.Equals(bitsets & bitmask, bitmask); - - return TNegator.NegateIfNeeded(result); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator @@ -1600,53 +1198,13 @@ private static unsafe int ComputeLastIndexOverlapped(ref T searchSp return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512F))] - private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector512 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = PackedSpanHelpers.FixUpPackedVector512Result(result); - } - - ulong mask = TNegator.ExtractMask(result); - - int offsetInVector = 63 - BitOperations.LeadingZeroCount(mask); - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512F))] - private static unsafe int ComputeLastIndexOverlapped(ref T searchSpace, ref T secondVector, Vector512 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = PackedSpanHelpers.FixUpPackedVector512Result(result); - } - - ulong mask = TNegator.ExtractMask(result); - - int offsetInVector = 63 - BitOperations.LeadingZeroCount(mask); - if (offsetInVector < Vector512.Count) - { - return offsetInVector; - } - - // We matched within the second vector - return offsetInVector - Vector512.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); - } - internal interface INegator { static abstract bool NegateIfNeeded(bool result); static abstract Vector128 NegateIfNeeded(Vector128 result); static abstract Vector256 NegateIfNeeded(Vector256 result); - static abstract Vector512 NegateIfNeeded(Vector512 result); static abstract uint ExtractMask(Vector128 result); static abstract uint ExtractMask(Vector256 result); - static abstract ulong ExtractMask(Vector512 result); } internal readonly struct DontNegate : INegator @@ -1654,13 +1212,10 @@ internal interface INegator public static bool NegateIfNeeded(bool result) => result; public static Vector128 NegateIfNeeded(Vector128 result) => result; public static Vector256 NegateIfNeeded(Vector256 result) => result; - public static Vector512 NegateIfNeeded(Vector512 result) => result; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector128 result) => ~Vector128.Equals(result, Vector128.Zero).ExtractMostSignificantBits(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector256 result) => ~Vector256.Equals(result, Vector256.Zero).ExtractMostSignificantBits(); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static ulong ExtractMask(Vector512 result) => ~Vector512.Equals(result, Vector512.Zero).ExtractMostSignificantBits(); } internal readonly struct Negate : INegator @@ -1673,13 +1228,9 @@ internal interface INegator [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 NegateIfNeeded(Vector256 result) => Vector256.Equals(result, Vector256.Zero); [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector512 NegateIfNeeded(Vector512 result) => Vector512.Equals(result, Vector512.Zero); - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector128 result) => result.ExtractMostSignificantBits(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector256 result) => result.ExtractMostSignificantBits(); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static ulong ExtractMask(Vector512 result) => result.ExtractMostSignificantBits(); } internal interface IOptimizations @@ -1692,7 +1243,6 @@ internal interface IOptimizations // - All values result in min(value, 255) static abstract Vector128 PackSources(Vector128 lower, Vector128 upper); static abstract Vector256 PackSources(Vector256 lower, Vector256 upper); - static abstract Vector512 PackSources(Vector512 lower, Vector512 upper); } internal readonly struct Ssse3AndWasmHandleZeroInNeedle : IOptimizations @@ -1720,16 +1270,6 @@ public static Vector256 PackSources(Vector256 lower, Vector256 PackSources(Vector512 lower, Vector512 upper) - { - return Avx512BW.PackUnsignedSaturate( - Vector512.Min(lower, Vector512.Create((ushort)255)).AsInt16(), - Vector512.Min(upper, Vector512.Create((ushort)255)).AsInt16()); - } } internal readonly struct Default : IOptimizations @@ -1752,13 +1292,6 @@ public static Vector256 PackSources(Vector256 lower, Vector256 PackSources(Vector512 lower, Vector512 upper) - { - return Avx512BW.PackUnsignedSaturate(lower.AsInt16(), upper.AsInt16()); - } } private interface IResultMapper @@ -1769,10 +1302,8 @@ private interface IResultMapper static abstract TResult ScalarResult(ref T searchSpace, ref T current); static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator; static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector256 result) where TNegator : struct, INegator; - static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator; static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator; static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) where TNegator : struct, INegator; - static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator; } private readonly struct ContainsAnyResultMapper : IResultMapper @@ -1782,10 +1313,8 @@ private interface IResultMapper public static bool ScalarResult(ref T searchSpace, ref T current) => true; public static bool FirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator => true; public static bool FirstIndex(ref T searchSpace, ref T current, Vector256 result) where TNegator : struct, INegator => true; - public static bool FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator => true; public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator => true; public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) where TNegator : struct, INegator => true; - public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator => true; } private readonly unsafe struct IndexOfAnyResultMapper : IResultMapper @@ -1821,21 +1350,6 @@ public static int FirstIndex(ref T searchSpace, ref T current, Vector2 return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512F))] - public static int FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = PackedSpanHelpers.FixUpPackedVector512Result(result); - } - - ulong mask = TNegator.ExtractMask(result); - - int offsetInVector = BitOperations.TrailingZeroCount(mask); - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator { @@ -1870,27 +1384,6 @@ public static int FirstIndexOverlapped(ref T searchSpace, ref T curren } return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / (nuint)sizeof(T)); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512F))] - public static int FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = PackedSpanHelpers.FixUpPackedVector512Result(result); - } - - ulong mask = TNegator.ExtractMask(result); - - int offsetInVector = BitOperations.TrailingZeroCount(mask); - if (offsetInVector >= Vector512.Count) - { - // We matched within the second vector - current0 = ref current1; - offsetInVector -= Vector512.Count; - } - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / (nuint)sizeof(T)); - } } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs index 777251124cbbea..ebc94616ae642c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs @@ -30,7 +30,7 @@ public readonly bool ShouldUseAsciiFastScan { get { - if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && _startingAsciiChars.Bitmap128() != default) + if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && _startingAsciiChars.Bitmap != default) { // If there are a lot of starting characters such that we often find one early, // the ASCII fast scan may end up performing worse than checking one character at a time.