Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Investigate adjusting heuristics for unrolled block copies/initialization #82529

Closed
am11 opened this issue Feb 23, 2023 · 11 comments · Fixed by #83274
Closed

Investigate adjusting heuristics for unrolled block copies/initialization #82529

am11 opened this issue Feb 23, 2023 · 11 comments · Fixed by #83274
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone

Comments

@am11
Copy link
Member

am11 commented Feb 23, 2023

Zero initialization becomes expensive when two-byte struct fields are involved:

Given:

unsafe struct S1
{
    fixed byte a[10];
    int b;
    fixed byte c[23];
    fixed byte d[24];
    fixed byte e[25];
}

unsafe struct S2
{
    fixed short a[10];
    int b;
    fixed short c[23];
    fixed short d[24];
    fixed short e[25];
}

Unlike S1, initializing S2 does not inline the CORINFO_HELP_MEMSET call and fails to make use of vectors:

;   S1 X1() { S1 s = default; return s; }

C:X1():S1:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M61359_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M61359_IG03:
       nop      
       add      rsp, 24
       ret      

;   S2 X2() { S2 s = default; return s; }

C:X2():S2:this:
       push     rbx
       sub      rsp, 16
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+08H], rax
       mov      rbx, rsi
       xor      esi, esi
       mov      rdi, rbx
       mov      edx, 168
       call     [CORINFO_HELP_MEMSET]
       mov      rax, rbx
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+08H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 16
       pop      rbx
       ret      

The codegen can try to match that of C++ compiler:

X2():                                  # @X2()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 128], ymm0
        vmovups ymmword ptr [rdi + 96], ymm0
        vmovups ymmword ptr [rdi + 64], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        mov     qword ptr [rdi + 160], 0
        vzeroupper
        ret

https://godbolt.org/z/a3WsPdd4M

@ghost ghost added the untriaged New issue has not been triaged by the area owner label Feb 23, 2023
@am11 am11 added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 23, 2023
@ghost
Copy link

ghost commented Feb 23, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

Zero initialization becomes expensive when two-byte struct fields are involved:

Given:

unsafe struct S1
{
    fixed byte a[10];
    int b;
    fixed byte c[23];
    fixed byte d[24];
    fixed byte e[25];
}

unsafe struct S2
{
    fixed short a[10];
    int b;
    fixed short c[23];
    fixed short d[24];
    fixed short e[25];
}

Unlike S1, initializing S2 does not inline the CORINFO_HELP_MEMSET call and fails to make use of vectors:

;   S1 X1() { S1 s = default; return s; }

C:X1():S1:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M61359_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M61359_IG03:
       nop      
       add      rsp, 24
       ret      

;   S2 X2() { S2 s = default; return s; }

C:X2():S2:this:
       push     rbx
       sub      rsp, 16
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+08H], rax
       mov      rbx, rsi
       xor      esi, esi
       mov      rdi, rbx
       mov      edx, 168
       call     [CORINFO_HELP_MEMSET]
       mov      rax, rbx
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+08H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 16
       pop      rbx
       ret      

The codegen can try to match that of C++ compiler:

X2():                                  # @X2()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 128], ymm0
        vmovups ymmword ptr [rdi + 96], ymm0
        vmovups ymmword ptr [rdi + 64], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        mov     qword ptr [rdi + 160], 0
        vzeroupper
        ret

https://godbolt.org/z/9G4dPs5jG

Author: am11
Assignees: -
Labels:

area-CodeGen-coreclr, untriaged

Milestone: -

@jakobbotsch
Copy link
Member

Whether or not we call the helper depends on the size of the struct. It is a heuristic, so this is really about whether or not the heuristic should be adjusted.

else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
if (src->OperIs(GT_IND))
{
ContainBlockStoreAddress(blkNode, size, src->AsIndir()->Addr(), src->AsIndir());
}
ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
}
else
{
assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
#ifdef TARGET_AMD64
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
#else
// TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
#endif
}

@jakobbotsch jakobbotsch changed the title Vectorize zero initialization of struct with fixed-size arrays Investigate adjusting heuristics for unrolled block copies/initialization Feb 23, 2023
@am11
Copy link
Member Author

am11 commented Feb 23, 2023

this is really about whether or not the heuristic should be adjusted.

Another issue was that even without the helper call (C:X1() above), the codegen isn't as optimal as it could be.

X1():                                 # @X1()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 56], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

+ call to fail-fast (if necessary?)

@jakobbotsch
Copy link
Member

jakobbotsch commented Feb 23, 2023

Some of the code is due to the stack cookie that we are storing/checking, which the native compiler doesn't do.

If I change S1 in the native code to be the same size as in managed code (i.e. 88 bytes), I get:

        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 56], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0

So clang is doing an "unaligned" block write instead of our:

       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax

That definitely seems like something we might be able to improve. cc @tannergooding, not sure if there would be any concerns about writes that cross cache lines that we don't already have? (Presumably not, since a struct like this is anyway only 4-byte aligned)

@tannergooding
Copy link
Member

tannergooding commented Feb 23, 2023

Shouldn't be an issue for most of the scenarios we have. Cache line splits tend to be a very small impact and so the two individual stores that would otherwise avoid the split need to be setup just right to be "faster" (or the splits need to be frequent enough to add up as measurable).

Page splits are worse, but also much less frequent.

@am11
Copy link
Member Author

am11 commented Feb 24, 2023

Some of the code is due to the stack cookie that we are storing/checking, which the native compiler doesn't do.

@jakobbotsch, thanks for this pointer. It looks like clang is able to prove that these functions are not vulnerable to stack smashing, so it produces the same result even with -fstack-protector-strong (uses strong heuristics to find vulnerable functions): https://godbolt.org/z/j4Y39nfrK. With -fstack-protector-all (enables protection in all functions without analysis), however, it exhibits similar -- still slightly better -- code quality.

@jakobbotsch
Copy link
Member

It looks like clang is able to prove that these functions are not vulnerable to stack smashing, so it produces the same result even with -fstack-protector-strong (uses strong heuristics to find vulnerable functions): https://godbolt.org/z/j4Y39nfrK

Clang still does not seem to produce any stack guards for the following code that is clearly vulnerable to buffer overflows when I specify -fstack-protector-strong:

void foo(char* p);

S1 X1()
{
    S1 s { };
    foo(s.e);
    return s;
}

@jakobbotsch
Copy link
Member

jakobbotsch commented Feb 24, 2023

With that said, we could definitely omit it in some cases, I suppose. Today we just turn the check on whenever we see a struct with UnsafeValueType attribute (which the fixed causes Roslyn to add), or when we see stackalloc.

@JulieLeeMSFT JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label Mar 6, 2023
@JulieLeeMSFT JulieLeeMSFT modified the milestones: 8.0.0, Future Mar 6, 2023
@JulieLeeMSFT
Copy link
Member

Setting to Future for now. I don't think we can accommodate it in .NET 8.

@ghost ghost added the in-pr There is an active PR which will close this issue when it is merged label Mar 10, 2023
@ghost ghost removed the in-pr There is an active PR which will close this issue when it is merged label Mar 13, 2023
@am11
Copy link
Member Author

am11 commented Mar 16, 2023

@EgorBo, was it so that the heuristic is adjusted and we should expect better codegen for X2() (without CORINFO_HELP_MEMSET etc.) as it does for X1()? I couldn't find any difference with 1936536 (crossgen2 --optimize --instruction-set avx512dq ...)

@am11 am11 reopened this Mar 16, 2023
@am11
Copy link
Member Author

am11 commented Mar 22, 2023

With #83638, memset is inlined for int16 case as well:

C:X2():S2:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  ymmword ptr[rsi+40H], ymm0
       vmovdqu  ymmword ptr[rsi+60H], ymm0
       vmovdqu  ymmword ptr[rsi+80H], ymm0
       mov      qword ptr [rsi+A0H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 24
       ret      

C:.ctor():this:
       ret      

Thanks @EgorBo! :)

@am11 am11 closed this as completed Mar 22, 2023
@ghost ghost locked as resolved and limited conversation to collaborators Apr 21, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants