@@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp
90
90
}
91
91
}
92
92
93
+ end = srcMax - 66 ;
94
+ if ( AdvSimd . Arm64 . IsSupported && ( end >= src ) )
95
+ {
96
+ AdvSimdDecode ( ref src , ref dest , end , maxSrcLength , destLength , srcBytes , destBytes ) ;
97
+
98
+ if ( src == srcEnd )
99
+ {
100
+ goto DoneExit ;
101
+ }
102
+ }
103
+
93
104
end = srcMax - 24 ;
94
105
if ( ( Ssse3 . IsSupported || AdvSimd . Arm64 . IsSupported ) && BitConverter . IsLittleEndian && ( end >= src ) )
95
106
{
@@ -844,6 +855,107 @@ private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte>
844
855
return Vector128 . ShuffleUnsafe ( left , right ) ;
845
856
}
846
857
858
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // Arm64 AdvSimd Base64 decode loop: consumes 64 input bytes and produces 48 output bytes per iteration.
    // C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c
    //
    //   srcBytes / destBytes      - in/out cursors; on return they hold the positions reached by this loop.
    //   srcEnd                    - last source position at which another 64-byte iteration may start.
    //                               The caller is expected to stop early enough that the trailing '=='
    //                               padding is never consumed here (64 + 2 = 66 bytes).
    //   srcStart / sourceLength   - forwarded unchanged to AssertRead.
    //   destStart / destLength    - forwarded unchanged to AssertWrite.
    //
    // The loop stops without advancing the cursors as soon as a 64-byte chunk contains a byte that
    // does not decode to a value in the range 0..63.

    // First look-up table, indexed directly by the input byte (0..63).
    // Only index 43 (-> 62), index 47 (-> 63) and indices 48..57 (-> 52..61) hold valid values;
    // every other entry is 0xFF. The first two 16-byte rows are entirely 0xFF, so one vector serves both.
    Vector128<byte> allInvalidRow = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
    var lowTable = (
        allInvalidRow,
        allInvalidRow,
        Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(),
        Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte());

    // Second look-up table, indexed by the saturating difference (input - 0x3F).
    // Entry 0 is 0 so that inputs already handled by the first table contribute nothing to the OR below.
    // Entries 2..27 hold 0..25 and entries 34..59 hold 26..51; the remaining entries are 0xFF.
    var highTable = (
        Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(),
        Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(),
        Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(),
        Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte());

    // 0x3F is both the bias applied before the second look-up and the largest valid decoded value.
    Vector128<byte> sixBitMax = AdvSimd.DuplicateToVector128((byte)0x3F);

    byte* readPtr = srcBytes;
    byte* writePtr = destBytes;

    while (true)
    {
        // Load 64 bytes and de-interleave them into four vectors.
        // NOTE(review): the helper is instantiated with a single Vector128<byte> although 64 bytes
        // are read here - confirm against AssertRead that this covers the whole load.
        AssertRead<Vector128<byte>>(readPtr, srcStart, sourceLength);
        (Vector128<byte> in0, Vector128<byte> in1, Vector128<byte> in2, Vector128<byte> in3) =
            AdvSimd.Arm64.LoadVector128x4AndUnzip(readPtr);

        // Values from the first table. Out-of-range indices are set to 0.
        Vector128<byte> low0 = AdvSimd.Arm64.VectorTableLookup(lowTable, in0);
        Vector128<byte> low1 = AdvSimd.Arm64.VectorTableLookup(lowTable, in1);
        Vector128<byte> low2 = AdvSimd.Arm64.VectorTableLookup(lowTable, in2);
        Vector128<byte> low3 = AdvSimd.Arm64.VectorTableLookup(lowTable, in3);

        // Indices for the second table: saturating subtraction clamps inputs below 0x3F to 0.
        Vector128<byte> high0 = AdvSimd.SubtractSaturate(in0, sixBitMax);
        Vector128<byte> high1 = AdvSimd.SubtractSaturate(in1, sixBitMax);
        Vector128<byte> high2 = AdvSimd.SubtractSaturate(in2, sixBitMax);
        Vector128<byte> high3 = AdvSimd.SubtractSaturate(in3, sixBitMax);

        // Values from the second table. Out-of-range indices are left unchanged.
        high0 = AdvSimd.Arm64.VectorTableLookupExtension(high0, highTable, high0);
        high1 = AdvSimd.Arm64.VectorTableLookupExtension(high1, highTable, high1);
        high2 = AdvSimd.Arm64.VectorTableLookupExtension(high2, highTable, high2);
        high3 = AdvSimd.Arm64.VectorTableLookupExtension(high3, highTable, high3);

        // Both look-ups mark invalid inputs with 0xFF (or leave a value above 63 in place), and each
        // yields 0 for inputs that belong to the other table, so OR-ing the halves gives the final value.
        Vector128<byte> sextet0 = low0 | high0;
        Vector128<byte> sextet1 = low1 | high1;
        Vector128<byte> sextet2 = low2 | high2;
        Vector128<byte> sextet3 = low3 | high3;

        // Any lane larger than 63 means the chunk contained an invalid input byte.
        Vector128<byte> invalidLanes = AdvSimd.CompareGreaterThan(sextet0, sixBitMax)
                                     | AdvSimd.CompareGreaterThan(sextet1, sixBitMax)
                                     | AdvSimd.CompareGreaterThan(sextet2, sixBitMax)
                                     | AdvSimd.CompareGreaterThan(sextet3, sixBitMax);

        // Leave the cursors on the offending chunk; nothing has been stored for it.
        if (invalidLanes != Vector128<byte>.Zero)
        {
            break;
        }

        // Pack four 6-bit values into three bytes.
        Vector128<byte> out0 = AdvSimd.ShiftLeftLogical(sextet0, 2) | AdvSimd.ShiftRightLogical(sextet1, 4);
        Vector128<byte> out1 = AdvSimd.ShiftLeftLogical(sextet1, 4) | AdvSimd.ShiftRightLogical(sextet2, 2);
        Vector128<byte> out2 = AdvSimd.ShiftLeftLogical(sextet2, 6) | sextet3;

        // Re-interleave the three vectors and store the 48 decoded bytes.
        AssertWrite<Vector128<byte>>(writePtr, destStart, destLength);
        AdvSimd.Arm64.StoreVector128x3AndZip(writePtr, (out0, out1, out2));

        readPtr += 64;
        writePtr += 48;

        // Same exit test as a trailing 'while (src <= srcEnd)'.
        if (readPtr > srcEnd)
        {
            break;
        }
    }

    // Publish the progress made back to the caller.
    srcBytes = readPtr;
    destBytes = writePtr;
}
958
+
847
959
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
848
960
[ CompExactlyDependsOn ( typeof ( AdvSimd . Arm64 ) ) ]
849
961
[ CompExactlyDependsOn ( typeof ( Ssse3 ) ) ]
0 commit comments