-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use multi-reg load/store for DecodeFromUTF8 #100589
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp | |
} | ||
} | ||
|
||
end = srcMax - 66; | ||
if (AdvSimd.Arm64.IsSupported && (end >= src)) | ||
{ | ||
AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); | ||
|
||
if (src == srcEnd) | ||
{ | ||
goto DoneExit; | ||
} | ||
} | ||
|
||
end = srcMax - 24; | ||
if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) | ||
{ | ||
|
@@ -844,6 +855,107 @@ private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> | |
return Vector128.ShuffleUnsafe(left, right); | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))] | ||
private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) | ||
{ | ||
// C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c | ||
// If we have AdvSimd support, pick off 64 bytes at a time for as long as we can, | ||
// but make sure that we quit before seeing any == markers at the end of the | ||
// string. 64 + 2 = 66 bytes. | ||
|
||
Vector128<byte> decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); | ||
Vector128<byte> decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); | ||
Vector128<byte> decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(); | ||
Vector128<byte> decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte(); | ||
Vector128<byte> decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(); | ||
Vector128<byte> decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(); | ||
Vector128<byte> decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(); | ||
Vector128<byte> decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte(); | ||
Comment on lines
+867
to
+874
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not clear why these all need to be declared up front. It's effectively defining unnecessary locals and work for the JIT to do. This could have been |
||
|
||
Vector128<byte> decOne1; | ||
Vector128<byte> decOne2; | ||
Vector128<byte> decOne3; | ||
Vector128<byte> decOne4; | ||
Vector128<byte> decTwo1; | ||
Vector128<byte> decTwo2; | ||
Vector128<byte> decTwo3; | ||
Vector128<byte> decTwo4; | ||
Vector128<byte> str1; | ||
Vector128<byte> str2; | ||
Vector128<byte> str3; | ||
Vector128<byte> str4; | ||
Vector128<byte> res1; | ||
Vector128<byte> res2; | ||
Vector128<byte> res3; | ||
Comment on lines
+876
to
+890
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar here, it's not clear why these were defined up front like this. They could have been declared in the loop at the declaration site (since they aren't used outside the loop) making the overall code much more readable, without needing 15 up front declarations |
||
|
||
byte* src = srcBytes; | ||
byte* dest = destBytes; | ||
Vector128<byte> offset = AdvSimd.DuplicateToVector128((byte)0x3F); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The former allows more general optimizations to kick in as it does not prescribe a particular instruction be emitted. |
||
var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4); | ||
var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4); | ||
|
||
do | ||
{ | ||
// Load 64 bytes and de-interleave. | ||
AssertRead<Vector128<byte>>(src, srcStart, sourceLength); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know this is pre-existing to the PR, but this is not a great method name. It should probably be something like |
||
(str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nice if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you mean a wrapper method, say There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, rather as per my comment above (#100589 (comment)) we prefer to write xplat code so logic can be shared across CPU architectures where possible. To facilitate this, we ideally have the JIT recognize simple patterns and optimize them.
The driving question tends to be, "does this logic actually have to be platform/ISA specific, or is there a simple pattern we could enable the JIT to recognize instead?". In the case simple pattern recognition is possible, then it's generally ideal to enable it so that existing user code lights up and gets faster on Arm64 without the need for users to go and manually do the work. It also lets the same code automatically light-up for things like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, thanks for this clarification. I'll keep this mind for the common patterns in the future. |
||
|
||
// Get indices for second LUT: | ||
decTwo1 = AdvSimd.SubtractSaturate(str1, offset); | ||
decTwo2 = AdvSimd.SubtractSaturate(str2, offset); | ||
decTwo3 = AdvSimd.SubtractSaturate(str3, offset); | ||
decTwo4 = AdvSimd.SubtractSaturate(str4, offset); | ||
|
||
// Get values from first LUT. Out-of-range indices are set to 0. | ||
decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); | ||
decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); | ||
decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); | ||
decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); | ||
|
||
// Get values from second LUT. Out-of-range indices are unchanged. | ||
decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1); | ||
decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2); | ||
decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3); | ||
decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4); | ||
|
||
// Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'. | ||
// Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. | ||
str1 = decOne1 | decTwo1; | ||
str2 = decOne2 | decTwo2; | ||
str3 = decOne3 | decTwo3; | ||
str4 = decOne4 | decTwo4; | ||
Comment on lines
+904
to
+927
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This general algorithm probably could do with a comment walking through what's happening so the "magic" is better understood, much as exists for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, it's difficult to follow. Although, it's a bit tricky to describe the entire workings with lookup tables and interleaving. I'll try to make it less opaque. |
||
|
||
// Check for invalid input, any value larger than 63. | ||
Vector128<byte> classified = AdvSimd.CompareGreaterThan(str1, offset) | ||
| AdvSimd.CompareGreaterThan(str2, offset) | ||
| AdvSimd.CompareGreaterThan(str3, offset) | ||
| AdvSimd.CompareGreaterThan(str4, offset); | ||
Comment on lines
+930
to
+933
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These could have been the xplat There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't understand the xplat bit. Could you elaborate on this, please? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have APIs defined on Using these APIs tends to be preferred over using the platform specific APIs, particularly when there is a clear 1-to-1 mapping. Thus Using the xplat (cross platform) APIs gives the JIT more flexibility in the code it generates and allows more portability, while also often improving readability. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense, I'll update these calls 👍 |
||
|
||
// Check that all bits are zero. | ||
if (classified != Vector128<byte>.Zero) | ||
{ | ||
break; | ||
} | ||
|
||
// Compress four bytes into three. | ||
res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4); | ||
res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2); | ||
res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4; | ||
Comment on lines
+942
to
+944
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These could have been of the form |
||
|
||
// Interleave and store decoded result. | ||
AssertWrite<Vector128<byte>>(dest, destStart, destLength); | ||
AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3)); | ||
|
||
src += 64; | ||
dest += 48; | ||
} | ||
while (src <= srcEnd); | ||
|
||
srcBytes = src; | ||
destBytes = dest; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))] | ||
[CompExactlyDependsOn(typeof(Ssse3))] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These could have been
Vector128<byte>.AllBitsSet