From a7931f5a72cb01562df50161221dbf5f75361bb1 Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Mon, 25 Mar 2024 12:38:19 +0000 Subject: [PATCH 1/2] Use multi-reg load/store for DecodeFromUTF8 --- .../src/System/Buffers/Text/Base64Decoder.cs | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index 1ad2cf9faa9f3..af1ecdc1f2250 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Sp } } + end = srcMax - 66; + if (AdvSimd.Arm64.IsSupported && (end >= src)) + { + AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + { + goto DoneExit; + } + } + end = srcMax - 24; if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) { @@ -844,6 +855,105 @@ private static Vector128 SimdShuffle(Vector128 left, Vector128 return Vector128.ShuffleUnsafe(left, right); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) + { + // C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c + // If we have AdvSimd support, pick off 64 bytes at a time for as long as we can, + // but make sure that we quit before seeing any == markers at the end of the + // string. 64 + 2 = 66 bytes. 
+ + Vector128 decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); + Vector128 decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); + Vector128 decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(); + Vector128 decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte(); + Vector128 decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(); + Vector128 decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(); + Vector128 decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(); + Vector128 decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte(); + + Vector128 decOne1; + Vector128 decOne2; + Vector128 decOne3; + Vector128 decOne4; + Vector128 decTwo1; + Vector128 decTwo2; + Vector128 decTwo3; + Vector128 decTwo4; + Vector128 str1; + Vector128 str2; + Vector128 str3; + Vector128 str4; + Vector128 res1; + Vector128 res2; + Vector128 res3; + + byte* src = srcBytes; + byte* dest = destBytes; + Vector128 offset = AdvSimd.DuplicateToVector128((byte)0x3F); + + do + { + // Load 64 bytes and de-interleave. + AssertRead>(src, srcStart, sourceLength); + (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); + + // Get indices for second LUT: + decTwo1 = AdvSimd.SubtractSaturate(str1, offset); + decTwo2 = AdvSimd.SubtractSaturate(str2, offset); + decTwo3 = AdvSimd.SubtractSaturate(str3, offset); + decTwo4 = AdvSimd.SubtractSaturate(str4, offset); + + // Get values from first LUT. Out-of-range indices are set to 0. 
+ decOne1 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str1); + decOne2 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str2); + decOne3 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str3); + decOne4 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str4); + + // Get values from second LUT. Out-of-range indices are unchanged. + decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo1); + decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo2); + decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo3); + decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo4); + + // Invalid values are set to 255 during above look-ups using 'decLutOne' and 'decLutTwo'. + // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. + str1 = AdvSimd.Or(decOne1, decTwo1); + str2 = AdvSimd.Or(decOne2, decTwo2); + str3 = AdvSimd.Or(decOne3, decTwo3); + str4 = AdvSimd.Or(decOne4, decTwo4); + + // Check for invalid input, any value larger than 63. + Vector128 classified = AdvSimd.CompareGreaterThan(str1, offset) + | AdvSimd.CompareGreaterThan(str2, offset) + | AdvSimd.CompareGreaterThan(str3, offset) + | AdvSimd.CompareGreaterThan(str4, offset); + + // Check that all bits are zero. + if (classified != Vector128.Zero) + { + break; + } + + // Compress four bytes into three. + res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4); + res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2); + res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4; + + // Interleave and store decoded result. 
+ AssertWrite>(dest, destStart, destLength); + AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3)); + + src += 64; + dest += 48; + } + while (src <= srcEnd); + + srcBytes = src; + destBytes = dest; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Ssse3))] From 1ab9bcf3b39d8a88af85af2239718a681d92f03a Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Tue, 9 Apr 2024 17:47:56 +0100 Subject: [PATCH 2/2] Address review comments --- .../src/System/Buffers/Text/Base64Decoder.cs | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index af1ecdc1f2250..16ff227379d1a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -892,12 +892,14 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes byte* src = srcBytes; byte* dest = destBytes; Vector128 offset = AdvSimd.DuplicateToVector128((byte)0x3F); + var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4); + var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4); do { // Load 64 bytes and de-interleave. AssertRead>(src, srcStart, sourceLength); - (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); + (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); // Get indices for second LUT: decTwo1 = AdvSimd.SubtractSaturate(str1, offset); @@ -906,23 +908,23 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes decTwo4 = AdvSimd.SubtractSaturate(str4, offset); // Get values from first LUT. Out-of-range indices are set to 0. 
- decOne1 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str1); - decOne2 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str2); - decOne3 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str3); - decOne4 = AdvSimd.Arm64.VectorTableLookup((decLutOne1, decLutOne2, decLutOne3, decLutOne4), str4); + decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); + decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); + decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); + decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); // Get values from second LUT. Out-of-range indices are unchanged. - decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo1); - decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo2); - decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo3); - decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4), decTwo4); + decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1); + decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2); + decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3); + decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4); // Invalid values are set to 255 during above look-ups using 'decLutOne' and 'decLutTwo'. // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. 
- str1 = AdvSimd.Or(decOne1, decTwo1); - str2 = AdvSimd.Or(decOne2, decTwo2); - str3 = AdvSimd.Or(decOne3, decTwo3); - str4 = AdvSimd.Or(decOne4, decTwo4); + str1 = decOne1 | decTwo1; + str2 = decOne2 | decTwo2; + str3 = decOne3 | decTwo3; + str4 = decOne4 | decTwo4; // Check for invalid input, any value larger than 63. Vector128 classified = AdvSimd.CompareGreaterThan(str1, offset)