Skip to content

Commit fa1164c

Browse files
Use multi-reg load/store for DecodeFromUTF8 (#100589)
* Use multi-reg load/store for DecodeFromUTF8 * Address review comments
1 parent 2e8f818 commit fa1164c

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed

src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs

+112
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp
9090
}
9191
}
9292

93+
end = srcMax - 66;
94+
if (AdvSimd.Arm64.IsSupported && (end >= src))
95+
{
96+
AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
97+
98+
if (src == srcEnd)
99+
{
100+
goto DoneExit;
101+
}
102+
}
103+
93104
end = srcMax - 24;
94105
if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src))
95106
{
@@ -844,6 +855,107 @@ private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte>
844855
return Vector128.ShuffleUnsafe(left, right);
845856
}
846857

858+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // C# port of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c
    //
    // Every iteration consumes 64 encoded bytes and produces 48 decoded bytes. The loop keeps
    // going while the read cursor is at or before 'srcEnd', and stops early (without consuming
    // the offending chunk) as soon as a chunk contains a byte that does not decode to 0..63.
    // On return 'srcBytes' and 'destBytes' are advanced past everything that was decoded.
    // The limit is chosen by the caller so that no '=' padding is seen here: 64 + 2 = 66 bytes.

    // First lookup table (4 x 16 bytes), indexed directly by the input byte.
    // 0xFF marks an entry that has no valid decoding.
    Vector128<byte> lowerLut0 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
    Vector128<byte> lowerLut1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
    Vector128<byte> lowerLut2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte();
    Vector128<byte> lowerLut3 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte();

    // Second lookup table (4 x 16 bytes), indexed by the input byte after a saturating
    // subtraction of 0x3F.
    Vector128<byte> upperLut0 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte();
    Vector128<byte> upperLut1 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte();
    Vector128<byte> upperLut2 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte();
    Vector128<byte> upperLut3 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte();

    var lowerLut = (lowerLut0, lowerLut1, lowerLut2, lowerLut3);
    var upperLut = (upperLut0, upperLut1, upperLut2, upperLut3);

    // 0x3F is both the bias applied before the second lookup and the largest valid 6-bit value.
    Vector128<byte> sixtyThree = AdvSimd.DuplicateToVector128((byte)0x3F);

    byte* readPtr = srcBytes;
    byte* writePtr = destBytes;

    do
    {
        // Read 64 bytes, de-interleaved so that lane k holds every fourth byte starting at k.
        // NOTE(review): the assert is parameterized on one Vector128<byte> although 64 bytes
        // are loaded - confirm against AssertRead that this is the intended check.
        AssertRead<Vector128<byte>>(readPtr, srcStart, sourceLength);
        (Vector128<byte> lane0, Vector128<byte> lane1, Vector128<byte> lane2, Vector128<byte> lane3) =
            AdvSimd.Arm64.LoadVector128x4AndUnzip(readPtr);

        // For each lane:
        //   * the first lookup yields 0 for indices that fall outside the table;
        //   * the second lookup (table-extension form) leaves such indices unchanged.
        // Invalid values are set to 255 by the look-ups in 'decLutOne' and 'decLutTwo'
        // (named 'lowerLut' and 'upperLut' here), so OR-ing both results gives the final value.
        Vector128<byte> biased = AdvSimd.SubtractSaturate(lane0, sixtyThree);
        Vector128<byte> sextets0 = AdvSimd.Arm64.VectorTableLookup(lowerLut, lane0)
                                 | AdvSimd.Arm64.VectorTableLookupExtension(biased, upperLut, biased);

        biased = AdvSimd.SubtractSaturate(lane1, sixtyThree);
        Vector128<byte> sextets1 = AdvSimd.Arm64.VectorTableLookup(lowerLut, lane1)
                                 | AdvSimd.Arm64.VectorTableLookupExtension(biased, upperLut, biased);

        biased = AdvSimd.SubtractSaturate(lane2, sixtyThree);
        Vector128<byte> sextets2 = AdvSimd.Arm64.VectorTableLookup(lowerLut, lane2)
                                 | AdvSimd.Arm64.VectorTableLookupExtension(biased, upperLut, biased);

        biased = AdvSimd.SubtractSaturate(lane3, sixtyThree);
        Vector128<byte> sextets3 = AdvSimd.Arm64.VectorTableLookup(lowerLut, lane3)
                                 | AdvSimd.Arm64.VectorTableLookupExtension(biased, upperLut, biased);

        // Anything above 63 means the chunk contains a byte that could not be decoded.
        Vector128<byte> invalidMask = AdvSimd.CompareGreaterThan(sextets0, sixtyThree)
                                    | AdvSimd.CompareGreaterThan(sextets1, sixtyThree)
                                    | AdvSimd.CompareGreaterThan(sextets2, sixtyThree)
                                    | AdvSimd.CompareGreaterThan(sextets3, sixtyThree);

        // Leave the loop with the cursors still pointing at this chunk if any bit is set.
        if (invalidMask != Vector128<byte>.Zero)
        {
            break;
        }

        // Pack four 6-bit values into three bytes.
        Vector128<byte> packed0 = AdvSimd.ShiftLeftLogical(sextets0, 2) | AdvSimd.ShiftRightLogical(sextets1, 4);
        Vector128<byte> packed1 = AdvSimd.ShiftLeftLogical(sextets1, 4) | AdvSimd.ShiftRightLogical(sextets2, 2);
        Vector128<byte> packed2 = AdvSimd.ShiftLeftLogical(sextets2, 6) | sextets3;

        // Re-interleave the three result vectors and write them out.
        // NOTE(review): the assert is parameterized on one Vector128<byte> although 48 bytes
        // are stored - confirm against AssertWrite that this is the intended check.
        AssertWrite<Vector128<byte>>(writePtr, destStart, destLength);
        AdvSimd.Arm64.StoreVector128x3AndZip(writePtr, (packed0, packed1, packed2));

        readPtr += 64;
        writePtr += 48;
    }
    while (readPtr <= srcEnd);

    // Publish how far decoding got back to the caller.
    srcBytes = readPtr;
    destBytes = writePtr;
}
958+
847959
[MethodImpl(MethodImplOptions.AggressiveInlining)]
848960
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
849961
[CompExactlyDependsOn(typeof(Ssse3))]

0 commit comments

Comments
 (0)