Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

Merged
merged 1 commit into from
Jul 17, 2023

Conversation

MihaZupan
Copy link
Member

@MihaZupan MihaZupan commented May 27, 2023

Contributes to #82564

(First time touching the JIT so I don't really know what I'm doing)

Vector128<byte> v0;

// Expands patterns like
v0 >>> 4;
// to
(v0.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)15);

// and
v0 >>> nonConstAmount;
// to
int maskedShiftAmount = nonConstAmount & 7;
Vector128<int> shiftVector = Sse2.ConvertScalarToVector128Int32(maskedShiftAmount);
Vector128<byte> shiftedInput = Sse2.ShiftRightLogical(v0.AsInt32(), shiftVector).AsByte();
return shiftedInput & Vector128.Create((byte)(255 >> maskedShiftAmount));

Codegen for

[Benchmark]
[Arguments(42)]
public Vector128<byte> SimpleShift(byte input) =>
    Vector128.Create(input) >>> 4;
Before
; VectorTest.SimpleShift(Byte)
       sub       rsp,48
       vzeroupper
       movzx     eax,r8b
       vpbroadcastb xmm0,eax
       vmovaps   [rsp+30],xmm0
       mov       rax,[rsp+30]
       mov       [rsp+20],rax
       movzx     eax,byte ptr [rsp+20]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+28],al
       movzx     eax,byte ptr [rsp+21]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+29],al
       movzx     eax,byte ptr [rsp+22]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2A],al
       movzx     eax,byte ptr [rsp+23]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2B],al
       movzx     eax,byte ptr [rsp+24]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2C],al
       movzx     eax,byte ptr [rsp+25]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2D],al
       movzx     eax,byte ptr [rsp+26]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2E],al
       movzx     eax,byte ptr [rsp+27]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2F],al
       mov       rax,[rsp+28]
       mov       rcx,[rsp+38]
       mov       [rsp+10],rcx
       movzx     ecx,byte ptr [rsp+10]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+18],cl
       movzx     ecx,byte ptr [rsp+11]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+19],cl
       movzx     ecx,byte ptr [rsp+12]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1A],cl
       movzx     ecx,byte ptr [rsp+13]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1B],cl
       movzx     ecx,byte ptr [rsp+14]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1C],cl
       movzx     ecx,byte ptr [rsp+15]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1D],cl
       movzx     ecx,byte ptr [rsp+16]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1E],cl
       movzx     ecx,byte ptr [rsp+17]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1F],cl
       mov       rcx,[rsp+18]
       mov       [rsp],rax
       mov       [rsp+8],rcx
       vmovaps   xmm0,[rsp]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       add       rsp,48
       ret
; Total bytes of code 319
After
; VectorTest.SimpleShift(Byte)
       vzeroupper
       movzx     eax,r8b
       vpbroadcastb xmm0,eax
       vpsrld    xmm0,xmm0,4
       vpand     xmm0,xmm0,[7FFC6F58A2E0]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       ret
; Total bytes of code 34

Codegen for

[Benchmark]
[Arguments(3)]
public Vector128<byte> SimpleShift(int shiftCount) =>
    Vector128.Create((byte)42) >>> shiftCount;
Before
; VectorTest.SimpleShift(Int32)
       sub       rsp,58
       vzeroupper
       vmovups   xmm0,[7FFC74E11BB0]
       vmovaps   [rsp+40],xmm0
       mov       rax,[rsp+40]
       mov       [rsp+20],rax
       xor       eax,eax
       mov       ecx,r8d
       and       ecx,7
M00_L00:
       lea       r9,[rsp+20]
       movsxd    r10,eax
       movzx     r9d,byte ptr [r9+r10]
       shrx      r9d,r9d,ecx
       movzx     r9d,r9b
       lea       r11,[rsp+28]
       mov       [r11+r10],r9b
       inc       eax
       cmp       eax,8
       jl        short M00_L00
       vmovaps   [rsp+30],xmm0
       mov       rax,[rsp+28]
       mov       rcx,[rsp+38]
       mov       [rsp+10],rcx
       xor       ecx,ecx
       and       r8d,7
M00_L01:
       lea       r9,[rsp+10]
       movsxd    r10,ecx
       movzx     r9d,byte ptr [r9+r10]
       shrx      r9d,r9d,r8d
       movzx     r9d,r9b
       lea       r11,[rsp+18]
       mov       [r11+r10],r9b
       inc       ecx
       cmp       ecx,8
       jl        short M00_L01
       mov       rcx,[rsp+18]
       mov       [rsp],rax
       mov       [rsp+8],rcx
       vmovaps   xmm0,[rsp]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       add       rsp,58
       ret
; Total bytes of code 173
After
; VectorTest.SimpleShift(Int32)
       vzeroupper
       and       r8d,7
       vmovd     xmm0,r8d
       vmovups   xmm1,[7FFCEB69A330]
       vpsrld    xmm0,xmm1,xmm0
       mov       eax,0FF
       shrx      eax,eax,r8d
       vpbroadcastb xmm1,eax
       vpand     xmm0,xmm0,xmm1
       vmovups   [rdx],xmm0
       mov       rax,rdx
       ret
; Total bytes of code 52

@MihaZupan MihaZupan added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label May 27, 2023
@MihaZupan MihaZupan added this to the 8.0.0 milestone May 27, 2023
@MihaZupan MihaZupan self-assigned this May 27, 2023
@ghost
Copy link

ghost commented May 27, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Contributes to #82564

(First time touching the JIT so I don't really know what I'm doing)

Vector128<byte> v0;

// Expands patterns like
v0 >>> 4;
// to
(v0.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)15);
Author: MihaZupan
Assignees: MihaZupan
Labels:

area-CodeGen-coreclr

Milestone: 8.0.0

Comment on lines 2285 to 2289
if (varTypeIsByte(simdBaseType) && !impStackTop(0).val->IsCnsIntOrI())
{
// byte and sbyte would require more work to support
// non-constant byte and sbyte would require more work to support
break;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't need this limitation. For shifts in particular, working with constant or non-constant data is the same: we have overloads that take byte shiftAmount and overloads that take Vector128<T> shiftAmount, with the latter simply being Vector128.CreateScalarUnsafe(shiftAmount) of the non-constant byte.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That handling of constant vs non-constant is general and simply part of the existing gtNewSimdBinOp handling, so you shouldn't need to do anything additional really.

We'll already have appropriately masked off the shift amount to be 0-7 and converted it to a Vector128 if necessary, so you should just need to generate the shift followed by an AND to mask off the upper n bits of each element.

It basically just becomes: if it's a constant, you can generate a new icon directly; otherwise, the mask is gtNewSimdBroadcastNode of 0xFF >> shiftAmount.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated it now to handle non-const shift amounts as well, but it feels a bit hacky with the temporary nonConstantByteShiftCountOp - please let me know if I'm on the right track.

Right now the non-const path is something like:

static Vector128<byte> Shift(Vector128<byte> input, int shiftAmount)
{
    int maskedShiftAmount = shiftAmount & 7;
    Vector128<int> shiftVector = Sse2.ConvertScalarToVector128Int32(maskedShiftAmount);
    Vector128<byte> shiftedInput = Sse2.ShiftRightLogical(input.AsInt32(), shiftVector).AsByte();
    return shiftedInput & Vector128.Create((byte)(255 >> maskedShiftAmount));
}

@MihaZupan
Copy link
Member Author

MihaZupan commented Jul 3, 2023

(don't mind the referenced issue spam - I'm just testing stuff and this PR is convenient)

@tannergooding
Copy link
Member

CC. @dotnet/jit-contrib, this needs a secondary review before merging.

@MihaZupan
Copy link
Member Author

Test failures were #88582

@MihaZupan MihaZupan merged commit 89d435c into dotnet:main Jul 17, 2023
MihaZupan added a commit to MihaZupan/runtime that referenced this pull request Jul 23, 2023
MihaZupan added a commit to MihaZupan/runtime that referenced this pull request Aug 7, 2023
@ghost ghost locked as resolved and limited conversation to collaborators Aug 16, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants