@@ -18,7 +18,7 @@ internal static class IndexOfAnyAsciiSearcher
18
18
public struct AsciiState ( Vector128 < byte > bitmap , BitVector256 lookup )
19
19
{
20
20
public Vector512 < byte > Bitmap512 = Vector512 . Create ( bitmap ) ;
21
- public BitVector256 Lookup = lookup ;
21
+ public readonly BitVector256 Lookup = lookup ;
22
22
23
23
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
24
24
public readonly Vector128 < byte > Bitmap128 ( ) => Bitmap512 . _lower . _lower ;
@@ -30,19 +30,31 @@ public readonly AsciiState CreateInverse() =>
30
30
new AsciiState ( ~ Bitmap128 ( ) , Lookup . CreateInverse ( ) ) ;
31
31
}
32
32
33
/// <summary>
/// Precomputed search state for "ASCII set plus one contiguous second set" lookups.
/// Both 128-bit input bitmaps are broadcast into 512-bit registers up front so the
/// V512 / V256 / V128 code paths can all slice their width out of the same field.
/// </summary>
public readonly struct AsciiWithSecondSetState(Vector128<byte> asciiBitmap, ushort offset, Vector128<byte> secondBitmap, ProbabilisticMapState lookup)
{
    // Subtracted from each character before probing the second bitmap.
    public readonly ushort Offset = offset;

    // 128-bit bitmaps broadcast to full 512-bit width; narrower views are taken via _lower.
    public readonly Vector512<byte> AsciiBitmap512 = Vector512.Create(asciiBitmap);
    public readonly Vector512<byte> SecondBitmap512 = Vector512.Create(secondBitmap);

    // Only used for single-character checks.
    public readonly ProbabilisticMapState Lookup = lookup;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly Vector128<byte> AsciiBitmap128() => AsciiBitmap512._lower._lower;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly Vector128<byte> SecondBitmap128() => SecondBitmap512._lower._lower;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly Vector256<byte> AsciiBitmap256() => AsciiBitmap512._lower;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly Vector256<byte> SecondBitmap256() => SecondBitmap512._lower;
}
40
52
41
- public struct AnyByteState ( Vector128 < byte > bitmap0 , Vector128 < byte > bitmap1 , BitVector256 lookup )
53
+ public readonly struct AnyByteState ( Vector128 < byte > bitmap0 , Vector128 < byte > bitmap1 , BitVector256 lookup )
42
54
{
43
- public Vector512 < byte > Bitmap0_512 = Vector512 . Create ( bitmap0 ) ;
44
- public Vector512 < byte > Bitmap1_512 = Vector512 . Create ( bitmap1 ) ;
45
- public BitVector256 Lookup = lookup ;
55
+ public readonly Vector512 < byte > Bitmap0_512 = Vector512 . Create ( bitmap0 ) ;
56
+ public readonly Vector512 < byte > Bitmap1_512 = Vector512 . Create ( bitmap1 ) ;
57
+ public readonly BitVector256 Lookup = lookup ;
46
58
47
59
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
48
60
public readonly Vector128 < byte > Bitmap0_128 ( ) => Bitmap0_512 . _lower . _lower ;
@@ -715,11 +727,72 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
715
727
if ( Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
716
728
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
717
729
{
718
- Vector256 < byte > asciiBitmap256 = state . AsciiBitmap ;
719
- Vector256 < byte > secondBitmap256 = state . SecondBitmap ;
730
+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
731
+ if ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported && searchSpaceLength > 2 * Vector256 < short > . Count )
732
+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
733
+ {
734
+ Vector512 < byte > asciiBitmap512 = state . AsciiBitmap512 ;
735
+ Vector512 < byte > secondBitmap512 = state . SecondBitmap512 ;
736
+ Vector512 < ushort > offset512 = Vector512 . Create ( state . Offset ) ;
737
+
738
+ if ( searchSpaceLength > 2 * Vector512 < short > . Count )
739
+ {
740
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
741
+ // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
742
+ // As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
743
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
744
+ // Let the fallback below handle it instead. This is why the condition is
745
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
746
+ ref short twoVectorsAwayFromEnd = ref Unsafe . Add ( ref searchSpace , searchSpaceLength - ( 2 * Vector512 < short > . Count ) ) ;
747
+
748
+ do
749
+ {
750
+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref currentSearchSpace ) ;
751
+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref currentSearchSpace , ( nuint ) Vector512 < short > . Count ) ;
752
+
753
+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
754
+ if ( result != Vector512 < byte > . Zero )
755
+ {
756
+ return TResultMapper . FirstIndex < TNegator > ( ref searchSpace , ref currentSearchSpace , result ) ;
757
+ }
758
+
759
+ currentSearchSpace = ref Unsafe . Add ( ref currentSearchSpace , 2 * Vector512 < short > . Count ) ;
760
+ }
761
+ while ( Unsafe . IsAddressLessThan ( ref currentSearchSpace , ref twoVectorsAwayFromEnd ) ) ;
762
+ }
763
+
764
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
765
+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
766
+ Debug . Assert ( searchSpaceLength >= Vector512 < short > . Count , "We expect that the input is long enough for us to load a whole vector." ) ;
767
+ {
768
+ ref short oneVectorAwayFromEnd = ref Unsafe . Add ( ref searchSpace , searchSpaceLength - Vector512 < short > . Count ) ;
769
+
770
+ ref short firstVector = ref Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref oneVectorAwayFromEnd )
771
+ ? ref oneVectorAwayFromEnd
772
+ : ref currentSearchSpace ;
773
+
774
+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref firstVector ) ;
775
+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref oneVectorAwayFromEnd ) ;
776
+
777
+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
778
+ if ( result != Vector512 < byte > . Zero )
779
+ {
780
+ return TResultMapper . FirstIndexOverlapped < TNegator > ( ref searchSpace , ref firstVector , ref oneVectorAwayFromEnd , result ) ;
781
+ }
782
+ }
783
+
784
+ return TResultMapper . NotFound ;
785
+ }
786
+
787
+ Vector256 < byte > asciiBitmap256 = state . AsciiBitmap256 ( ) ;
788
+ Vector256 < byte > secondBitmap256 = state . SecondBitmap256 ( ) ;
720
789
Vector256 < ushort > offset256 = Vector256 . Create ( state . Offset ) ;
721
790
722
- if ( searchSpaceLength > 2 * Vector256 < short > . Count )
791
+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
792
+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
793
+ if ( ! ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported ) && searchSpaceLength > 2 * Vector256 < short > . Count )
794
+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
795
+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
723
796
{
724
797
// Process the input in chunks of 32 characters (2 * Vector256<short>).
725
798
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -768,8 +841,8 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
768
841
return TResultMapper . NotFound ;
769
842
}
770
843
771
- Vector128 < byte > asciiBitmap = state . AsciiBitmap . _lower ;
772
- Vector128 < byte > secondBitmap = state . SecondBitmap . _lower ;
844
+ Vector128 < byte > asciiBitmap = state . AsciiBitmap128 ( ) ;
845
+ Vector128 < byte > secondBitmap = state . SecondBitmap128 ( ) ;
773
846
Vector128 < ushort > offset = Vector128 . Create ( state . Offset ) ;
774
847
775
848
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false
@@ -850,11 +923,68 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
850
923
if ( Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
851
924
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
852
925
{
853
- Vector256 < byte > asciiBitmap256 = state . AsciiBitmap ;
854
- Vector256 < byte > secondBitmap256 = state . SecondBitmap ;
926
+ if ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported && searchSpaceLength > 2 * Vector256 < short > . Count )
927
+ {
928
+ Vector512 < byte > asciiBitmap512 = state . AsciiBitmap512 ;
929
+ Vector512 < byte > secondBitmap512 = state . SecondBitmap512 ;
930
+ Vector512 < ushort > offset512 = Vector512 . Create ( state . Offset ) ;
931
+
932
+ if ( searchSpaceLength > 2 * Vector512 < short > . Count )
933
+ {
934
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
935
+ // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
936
+ // As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
937
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
938
+ // Let the fallback below handle it instead. This is why the condition is
939
+ // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
940
+ ref short twoVectorsAfterStart = ref Unsafe . Add ( ref searchSpace , 2 * Vector512 < short > . Count ) ;
941
+
942
+ do
943
+ {
944
+ currentSearchSpace = ref Unsafe . Subtract ( ref currentSearchSpace , 2 * Vector512 < short > . Count ) ;
945
+
946
+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref currentSearchSpace ) ;
947
+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref currentSearchSpace , ( nuint ) Vector512 < short > . Count ) ;
948
+
949
+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
950
+ if ( result != Vector512 < byte > . Zero )
951
+ {
952
+ return ComputeLastIndex < short , TNegator > ( ref searchSpace , ref currentSearchSpace , result ) ;
953
+ }
954
+ }
955
+ while ( Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref twoVectorsAfterStart ) ) ;
956
+ }
957
+
958
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
959
+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
960
+ Debug . Assert ( searchSpaceLength >= Vector512 < short > . Count , "We expect that the input is long enough for us to load a whole vector." ) ;
961
+ {
962
+ ref short oneVectorAfterStart = ref Unsafe . Add ( ref searchSpace , Vector512 < short > . Count ) ;
963
+
964
+ ref short secondVector = ref Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref oneVectorAfterStart )
965
+ ? ref Unsafe . Subtract ( ref currentSearchSpace , Vector512 < short > . Count )
966
+ : ref searchSpace ;
967
+
968
+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref searchSpace ) ;
969
+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref secondVector ) ;
970
+
971
+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
972
+ if ( result != Vector512 < byte > . Zero )
973
+ {
974
+ return ComputeLastIndexOverlapped < short , TNegator > ( ref searchSpace , ref secondVector , result ) ;
975
+ }
976
+ }
977
+
978
+ return - 1 ;
979
+ }
980
+
981
+ Vector256 < byte > asciiBitmap256 = state . AsciiBitmap256 ( ) ;
982
+ Vector256 < byte > secondBitmap256 = state . SecondBitmap256 ( ) ;
855
983
Vector256 < ushort > offset256 = Vector256 . Create ( state . Offset ) ;
856
984
857
- if ( searchSpaceLength > 2 * Vector256 < short > . Count )
985
+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
986
+ if ( ! ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported ) && searchSpaceLength > 2 * Vector256 < short > . Count )
987
+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
858
988
{
859
989
// Process the input in chunks of 32 characters (2 * Vector256<short>).
860
990
// We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -903,8 +1033,8 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
903
1033
return - 1 ;
904
1034
}
905
1035
906
- Vector128 < byte > asciiBitmap = state . AsciiBitmap . _lower ;
907
- Vector128 < byte > secondBitmap = state . SecondBitmap . _lower ;
1036
+ Vector128 < byte > asciiBitmap = state . AsciiBitmap128 ( ) ;
1037
+ Vector128 < byte > secondBitmap = state . SecondBitmap128 ( ) ;
908
1038
Vector128 < ushort > offset = Vector128 . Create ( state . Offset ) ;
909
1039
910
1040
if ( ! Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
@@ -1845,6 +1975,23 @@ private static Vector512<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
1845
1975
return TNegator . NegateIfNeeded ( result ) ;
1846
1976
}
1847
1977
1978
/// <summary>
/// 512-bit two-bitmap lookup: packs two Vector512&lt;short&gt; character vectors into a single
/// Vector512&lt;byte&gt; and probes both the ASCII bitmap and the offset second-set bitmap,
/// OR-ing the results. Mirrors the 128/256-bit overloads above for the AVX-512 path.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector512<short> source0, Vector512<short> source1, Vector512<byte> bitmapLookup0, Vector512<byte> bitmapLookup1, Vector512<ushort> offset)
    where TNegator : struct, INegator
    where TOptimizations : struct, IOptimizations
{
    // Bit 0 of the second bitmap must be clear: out-of-range characters saturate to 0
    // when packed, and a set 0th bit would make every such character a false match.
    Debug.Assert((bitmapLookup1[0] & 1) == 0, "The 0th bit in second bitmap shouldn't be set.");

    // Probe the ASCII bitmap with the raw characters, and the second bitmap with the
    // characters rebased by 'offset' (the second path always uses Default packing).
    Vector512<byte> asciiPacked = TOptimizations.PackSources(source0.AsUInt16(), source1.AsUInt16());
    Vector512<byte> rebasedPacked = Default.PackSources(source0.AsUInt16() - offset, source1.AsUInt16() - offset);

    Vector512<byte> asciiMatches = IndexOfAnyLookupCore(asciiPacked, bitmapLookup0);
    Vector512<byte> secondMatches = IndexOfAnyLookupCore(rebasedPacked, bitmapLookup1);

    // A character matches if it is in either set; negate for the Except* variants.
    return TNegator.NegateIfNeeded(asciiMatches | secondMatches);
}
1994
+
1848
1995
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
1849
1996
[ CompExactlyDependsOn ( typeof ( Avx512BW ) ) ]
1850
1997
private static Vector512 < byte > IndexOfAnyLookupCore ( Vector512 < byte > source , Vector512 < byte > bitmapLookup )
0 commit comments