Skip to content

Commit a10c8ae

Browse files
committed
Add Avx512 support
1 parent a0cb8ea commit a10c8ae

File tree

1 file changed

+167
-20
lines changed

1 file changed

+167
-20
lines changed

src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs

+167-20
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ internal static class IndexOfAnyAsciiSearcher
1818
public struct AsciiState(Vector128<byte> bitmap, BitVector256 lookup)
1919
{
2020
public Vector512<byte> Bitmap512 = Vector512.Create(bitmap);
21-
public BitVector256 Lookup = lookup;
21+
public readonly BitVector256 Lookup = lookup;
2222

2323
[MethodImpl(MethodImplOptions.AggressiveInlining)]
2424
public readonly Vector128<byte> Bitmap128() => Bitmap512._lower._lower;
@@ -30,19 +30,31 @@ public readonly AsciiState CreateInverse() =>
3030
new AsciiState(~Bitmap128(), Lookup.CreateInverse());
3131
}
3232

33-
public struct AsciiWithSecondSetState(Vector128<byte> asciiBitmap, ushort offset, Vector128<byte> secondBitmap, ProbabilisticMapState lookup)
33+
/// <summary>
/// Immutable search state holding two 128-bit bitmaps (widened to <see cref="Vector512{T}"/>), a character
/// offset, and a <c>ProbabilisticMapState</c>. The vectorized paths in this file load the bitmaps at the
/// width they need (512, 256 or 128 bits) through the fields and accessor methods below.
/// </summary>
public readonly struct AsciiWithSecondSetState(Vector128<byte> asciiBitmap, ushort offset, Vector128<byte> secondBitmap, ProbabilisticMapState lookup)
3434
{
35-
public ushort Offset = offset;
36-
public Vector256<byte> AsciiBitmap = Vector256.Create(asciiBitmap, asciiBitmap);
37-
public Vector256<byte> SecondBitmap = Vector256.Create(secondBitmap, secondBitmap);
38-
public ProbabilisticMapState Lookup = lookup; // Only used for single-character checks.
35+
// Broadcast into a vector by callers and subtracted from every input character before the second-bitmap lookup.
public readonly ushort Offset = offset;
36+
// 512-bit form of asciiBitmap. NOTE(review): presumably the 128-bit value repeated in every 128-bit lane,
// mirroring the removed Vector256.Create(asciiBitmap, asciiBitmap) - confirm against Vector512.Create.
public readonly Vector512<byte> AsciiBitmap512 = Vector512.Create(asciiBitmap);
37+
// 512-bit form of secondBitmap, built the same way as AsciiBitmap512.
public readonly Vector512<byte> SecondBitmap512 = Vector512.Create(secondBitmap);
38+
public readonly ProbabilisticMapState Lookup = lookup; // Only used for single-character checks.
39+
40+
/// <summary>Returns the lowest 128 bits of <see cref="AsciiBitmap512"/>.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
41+
public readonly Vector128<byte> AsciiBitmap128() => AsciiBitmap512._lower._lower;
42+
43+
/// <summary>Returns the lowest 128 bits of <see cref="SecondBitmap512"/>.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
44+
public readonly Vector128<byte> SecondBitmap128() => SecondBitmap512._lower._lower;
45+
46+
/// <summary>Returns the lower 256 bits of <see cref="AsciiBitmap512"/>.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
47+
public readonly Vector256<byte> AsciiBitmap256() => AsciiBitmap512._lower;
48+
49+
/// <summary>Returns the lower 256 bits of <see cref="SecondBitmap512"/>.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
50+
public readonly Vector256<byte> SecondBitmap256() => SecondBitmap512._lower;
3951
}
4052

41-
public struct AnyByteState(Vector128<byte> bitmap0, Vector128<byte> bitmap1, BitVector256 lookup)
53+
public readonly struct AnyByteState(Vector128<byte> bitmap0, Vector128<byte> bitmap1, BitVector256 lookup)
4254
{
43-
public Vector512<byte> Bitmap0_512 = Vector512.Create(bitmap0);
44-
public Vector512<byte> Bitmap1_512 = Vector512.Create(bitmap1);
45-
public BitVector256 Lookup = lookup;
55+
public readonly Vector512<byte> Bitmap0_512 = Vector512.Create(bitmap0);
56+
public readonly Vector512<byte> Bitmap1_512 = Vector512.Create(bitmap1);
57+
public readonly BitVector256 Lookup = lookup;
4658

4759
[MethodImpl(MethodImplOptions.AggressiveInlining)]
4860
public readonly Vector128<byte> Bitmap0_128() => Bitmap0_512._lower._lower;
@@ -715,11 +727,72 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
715727
if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
716728
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
717729
{
718-
Vector256<byte> asciiBitmap256 = state.AsciiBitmap;
719-
Vector256<byte> secondBitmap256 = state.SecondBitmap;
730+
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
731+
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256<short>.Count)
732+
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
733+
{
734+
Vector512<byte> asciiBitmap512 = state.AsciiBitmap512;
735+
Vector512<byte> secondBitmap512 = state.SecondBitmap512;
736+
Vector512<ushort> offset512 = Vector512.Create(state.Offset);
737+
738+
if (searchSpaceLength > 2 * Vector512<short>.Count)
739+
{
740+
// Process the input in chunks of 64 characters (2 * Vector512<short>).
741+
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
742+
// As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
743+
// If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
744+
// Let the fallback below handle it instead. This is why the condition is
745+
// ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
746+
ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector512<short>.Count));
747+
748+
do
749+
{
750+
Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
751+
Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
752+
753+
Vector512<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, asciiBitmap512, secondBitmap512, offset512);
754+
if (result != Vector512<byte>.Zero)
755+
{
756+
return TResultMapper.FirstIndex<TNegator>(ref searchSpace, ref currentSearchSpace, result);
757+
}
758+
759+
currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512<short>.Count);
760+
}
761+
while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
762+
}
763+
764+
// We have 1-64 characters remaining. Process the first and last vector in the search space.
765+
// They may overlap, but we'll handle that in the index calculation if we do get a match.
766+
Debug.Assert(searchSpaceLength >= Vector512<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
767+
{
768+
ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512<short>.Count);
769+
770+
ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
771+
? ref oneVectorAwayFromEnd
772+
: ref currentSearchSpace;
773+
774+
Vector512<short> source0 = Vector512.LoadUnsafe(ref firstVector);
775+
Vector512<short> source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd);
776+
777+
Vector512<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, asciiBitmap512, secondBitmap512, offset512);
778+
if (result != Vector512<byte>.Zero)
779+
{
780+
return TResultMapper.FirstIndexOverlapped<TNegator>(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
781+
}
782+
}
783+
784+
return TResultMapper.NotFound;
785+
}
786+
787+
Vector256<byte> asciiBitmap256 = state.AsciiBitmap256();
788+
Vector256<byte> secondBitmap256 = state.SecondBitmap256();
720789
Vector256<ushort> offset256 = Vector256.Create(state.Offset);
721790

722-
if (searchSpaceLength > 2 * Vector256<short>.Count)
791+
#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
792+
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
793+
if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256<short>.Count)
794+
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
795+
#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
723796
{
724797
// Process the input in chunks of 32 characters (2 * Vector256<short>).
725798
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -768,8 +841,8 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
768841
return TResultMapper.NotFound;
769842
}
770843

771-
Vector128<byte> asciiBitmap = state.AsciiBitmap._lower;
772-
Vector128<byte> secondBitmap = state.SecondBitmap._lower;
844+
Vector128<byte> asciiBitmap = state.AsciiBitmap128();
845+
Vector128<byte> secondBitmap = state.SecondBitmap128();
773846
Vector128<ushort> offset = Vector128.Create(state.Offset);
774847

775848
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false
@@ -850,11 +923,68 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
850923
if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
851924
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
852925
{
853-
Vector256<byte> asciiBitmap256 = state.AsciiBitmap;
854-
Vector256<byte> secondBitmap256 = state.SecondBitmap;
926+
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256<short>.Count)
927+
{
928+
Vector512<byte> asciiBitmap512 = state.AsciiBitmap512;
929+
Vector512<byte> secondBitmap512 = state.SecondBitmap512;
930+
Vector512<ushort> offset512 = Vector512.Create(state.Offset);
931+
932+
if (searchSpaceLength > 2 * Vector512<short>.Count)
933+
{
934+
// Process the input in chunks of 64 characters (2 * Vector512<short>).
935+
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
936+
// As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
937+
// If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
938+
// Let the fallback below handle it instead. This is why the condition is
939+
// ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
940+
ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector512<short>.Count);
941+
942+
do
943+
{
944+
currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector512<short>.Count);
945+
946+
Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
947+
Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
948+
949+
Vector512<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, asciiBitmap512, secondBitmap512, offset512);
950+
if (result != Vector512<byte>.Zero)
951+
{
952+
return ComputeLastIndex<short, TNegator>(ref searchSpace, ref currentSearchSpace, result);
953+
}
954+
}
955+
while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart));
956+
}
957+
958+
// We have 1-64 characters remaining. Process the first and last vector in the search space.
959+
// They may overlap, but we'll handle that in the index calculation if we do get a match.
960+
Debug.Assert(searchSpaceLength >= Vector512<short>.Count, "We expect that the input is long enough for us to load a whole vector.");
961+
{
962+
ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512<short>.Count);
963+
964+
ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart)
965+
? ref Unsafe.Subtract(ref currentSearchSpace, Vector512<short>.Count)
966+
: ref searchSpace;
967+
968+
Vector512<short> source0 = Vector512.LoadUnsafe(ref searchSpace);
969+
Vector512<short> source1 = Vector512.LoadUnsafe(ref secondVector);
970+
971+
Vector512<byte> result = IndexOfAnyLookup<TNegator, TOptimizations>(source0, source1, asciiBitmap512, secondBitmap512, offset512);
972+
if (result != Vector512<byte>.Zero)
973+
{
974+
return ComputeLastIndexOverlapped<short, TNegator>(ref searchSpace, ref secondVector, result);
975+
}
976+
}
977+
978+
return -1;
979+
}
980+
981+
Vector256<byte> asciiBitmap256 = state.AsciiBitmap256();
982+
Vector256<byte> secondBitmap256 = state.SecondBitmap256();
855983
Vector256<ushort> offset256 = Vector256.Create(state.Offset);
856984

857-
if (searchSpaceLength > 2 * Vector256<short>.Count)
985+
#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
986+
if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256<short>.Count)
987+
#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
858988
{
859989
// Process the input in chunks of 32 characters (2 * Vector256<short>).
860990
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -903,8 +1033,8 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
9031033
return -1;
9041034
}
9051035

906-
Vector128<byte> asciiBitmap = state.AsciiBitmap._lower;
907-
Vector128<byte> secondBitmap = state.SecondBitmap._lower;
1036+
Vector128<byte> asciiBitmap = state.AsciiBitmap128();
1037+
Vector128<byte> secondBitmap = state.SecondBitmap128();
9081038
Vector128<ushort> offset = Vector128.Create(state.Offset);
9091039

9101040
if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128<short>.Count)
@@ -1845,6 +1975,23 @@ private static Vector512<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
18451975
return TNegator.NegateIfNeeded(result);
18461976
}
18471977

1978+
/// <summary>
/// Two-bitmap lookup over 64 characters supplied as two <see cref="Vector512{T}"/> of <see cref="short"/>.
/// The inputs are packed to bytes twice: once unchanged and once after subtracting <paramref name="offset"/>.
/// Each packed vector is looked up in its own bitmap, the two results are OR-ed together, and the combined
/// result is passed through <c>TNegator.NegateIfNeeded</c>.
/// </summary>
/// <param name="source0">First 32 characters.</param>
/// <param name="source1">Second 32 characters.</param>
/// <param name="bitmapLookup0">Bitmap used for the unshifted characters.</param>
/// <param name="bitmapLookup1">Bitmap used for the offset-subtracted characters; bit 0 of its first byte must be clear (asserted).</param>
/// <param name="offset">Value subtracted from every character before the second lookup.</param>
/// <returns>A byte vector that callers in this file compare against <c>Vector512&lt;byte&gt;.Zero</c>; a non-zero value is treated as "match found".</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1979+
[CompExactlyDependsOn(typeof(Avx512BW))]
1980+
private static Vector512<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector512<short> source0, Vector512<short> source1, Vector512<byte> bitmapLookup0, Vector512<byte> bitmapLookup1, Vector512<ushort> offset)
1981+
where TNegator : struct, INegator
1982+
where TOptimizations : struct, IOptimizations
1983+
{
1984+
// NOTE(review): the reason bit 0 must be clear is not visible in this hunk; presumably it keeps
// packed values that end up as 0 from matching the second set - confirm.
Debug.Assert((bitmapLookup1[0] & 1) == 0, "The 0th bit in second bitmap shouldn't be set.");
1985+
1986+
// Unshifted characters use the caller-selected packing strategy.
Vector512<byte> packed0 = TOptimizations.PackSources(source0.AsUInt16(), source1.AsUInt16());
1987+
// Offset-subtracted characters always use Default.PackSources rather than TOptimizations.
// NOTE(review): presumably because the shifted values no longer satisfy whatever TOptimizations assumes - confirm.
Vector512<byte> packed1 = Default.PackSources(source0.AsUInt16() - offset, source1.AsUInt16() - offset);
1988+
1989+
Vector512<byte> result0 = IndexOfAnyLookupCore(packed0, bitmapLookup0);
1990+
Vector512<byte> result1 = IndexOfAnyLookupCore(packed1, bitmapLookup1);
1991+
1992+
// A position counts as a hit if either lookup flagged it; negation (if any) is applied to the combined result.
return TNegator.NegateIfNeeded(result0 | result1);
1993+
}
1994+
18481995
[MethodImpl(MethodImplOptions.AggressiveInlining)]
18491996
[CompExactlyDependsOn(typeof(Avx512BW))]
18501997
private static Vector512<byte> IndexOfAnyLookupCore(Vector512<byte> source, Vector512<byte> bitmapLookup)

0 commit comments

Comments
 (0)