Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Investigate adjusting heuristics for unrolled block copies/initialization #82529

Closed
am11 opened this issue Feb 23, 2023 · 11 comments · Fixed by #83274
Closed

Investigate adjusting heuristics for unrolled block copies/initialization #82529

am11 opened this issue Feb 23, 2023 · 11 comments · Fixed by #83274
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone

Comments

@am11
Copy link
Member

am11 commented Feb 23, 2023

Zero initialization becomes expensive when two-byte struct fields are involved:

Given:

unsafe struct S1
{
    fixed byte a[10];
    int b;
    fixed byte c[23];
    fixed byte d[24];
    fixed byte e[25];
}

unsafe struct S2
{
    fixed short a[10];
    int b;
    fixed short c[23];
    fixed short d[24];
    fixed short e[25];
}

Unlike S1, initializing S2 does not inline the CORINFO_HELP_MEMSET call and fails to make use of vectors:

;   S1 X1() { S1 s = default; return s; }

C:X1():S1:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M61359_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M61359_IG03:
       nop      
       add      rsp, 24
       ret      

;   S2 X2() { S2 s = default; return s; }

C:X2():S2:this:
       push     rbx
       sub      rsp, 16
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+08H], rax
       mov      rbx, rsi
       xor      esi, esi
       mov      rdi, rbx
       mov      edx, 168
       call     [CORINFO_HELP_MEMSET]
       mov      rax, rbx
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+08H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 16
       pop      rbx
       ret      

The codegen can try to match that of C++ compiler:

X2():                                  # @X2()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 128], ymm0
        vmovups ymmword ptr [rdi + 96], ymm0
        vmovups ymmword ptr [rdi + 64], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        mov     qword ptr [rdi + 160], 0
        vzeroupper
        ret

https://godbolt.org/z/a3WsPdd4M

@ghost ghost added the untriaged New issue has not been triaged by the area owner label Feb 23, 2023
@am11 am11 added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 23, 2023
@ghost
Copy link

ghost commented Feb 23, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

Zero initialization becomes expensive when two-byte struct fields are involved:

Given:

unsafe struct S1
{
    fixed byte a[10];
    int b;
    fixed byte c[23];
    fixed byte d[24];
    fixed byte e[25];
}

unsafe struct S2
{
    fixed short a[10];
    int b;
    fixed short c[23];
    fixed short d[24];
    fixed short e[25];
}

Unlike S1, initializing S2 does not inline the CORINFO_HELP_MEMSET call and fails to make use of vectors:

;   S1 X1() { S1 s = default; return s; }

C:X1():S1:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M61359_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M61359_IG03:
       nop      
       add      rsp, 24
       ret      

;   S2 X2() { S2 s = default; return s; }

C:X2():S2:this:
       push     rbx
       sub      rsp, 16
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+08H], rax
       mov      rbx, rsi
       xor      esi, esi
       mov      rdi, rbx
       mov      edx, 168
       call     [CORINFO_HELP_MEMSET]
       mov      rax, rbx
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+08H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 16
       pop      rbx
       ret      

The codegen can try to match that of C++ compiler:

X2():                                  # @X2()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 128], ymm0
        vmovups ymmword ptr [rdi + 96], ymm0
        vmovups ymmword ptr [rdi + 64], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        mov     qword ptr [rdi + 160], 0
        vzeroupper
        ret

https://godbolt.org/z/9G4dPs5jG

Author: am11
Assignees: -
Labels:

area-CodeGen-coreclr, untriaged

Milestone: -

@jakobbotsch
Copy link
Member

Whether or not we call the helper depends on the size of the struct. It is a heuristic, so this is really about whether or not the heuristic should be adjusted.

else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
if (src->OperIs(GT_IND))
{
ContainBlockStoreAddress(blkNode, size, src->AsIndir()->Addr(), src->AsIndir());
}
ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
}
else
{
assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
#ifdef TARGET_AMD64
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
#else
// TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
#endif
}

@jakobbotsch jakobbotsch changed the title Vectorize zero initialization of struct with fixed-size arrays Investigate adjusting heuristics for unrolled block copies/initialization Feb 23, 2023
@am11
Copy link
Member Author

am11 commented Feb 23, 2023

this is really about whether or not the heuristic should be adjusted.

Another issue was that even without the helper call (C:X1() above), the codegen isn't as optimal as it could be.

X1():                                 # @X1()
        mov     rax, rdi
        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 56], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

+ call to fail-fast (if necessary?)

@jakobbotsch
Copy link
Member

jakobbotsch commented Feb 23, 2023

Some of the code is due to the stack cookie that we are storing/checking, which the native compiler doesn't do.

If I change S1 in the native code to be the same size as in managed code (i.e. 88 bytes), I get:

        vxorps  xmm0, xmm0, xmm0
        vmovups ymmword ptr [rdi + 56], ymm0
        vmovups ymmword ptr [rdi + 32], ymm0
        vmovups ymmword ptr [rdi], ymm0

So clang is doing an "unaligned" block write instead of our:

       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  xmmword ptr [rsi+40H], xmm0
       mov      qword ptr [rsi+50H], rax

That definitely seems like something we might be able to improve. cc @tannergooding, not sure if there would be any concerns about writes that cross cache lines that we don't already have? (Presumably not, since a struct like this is anyway only 4-byte aligned)

@tannergooding
Copy link
Member

tannergooding commented Feb 23, 2023

Shouldn't be an issue for most of the scenarios we have. Cache line splits tend to be a very small impact and so the two individual stores that would otherwise avoid the split need to be setup just right to be "faster" (or the splits need to be frequent enough to add up as measurable).

Page splits are worse, but also much less frequent.

@am11
Copy link
Member Author

am11 commented Feb 24, 2023

Some of the code is due to the stack cookie that we are storing/checking, which the native compiler doesn't do.

@jakobbotsch, thanks for this pointer. It looks like clang is able to prove that these functions are not vulnerable to stack smashing, so it produces the same result even with -fstack-protector-strong (uses strong heuristics to find vulnerable functions): https://godbolt.org/z/j4Y39nfrK. With -fstack-protector-all (enables protection in all functions without analysis), however, it exhibits similar -- still slightly better -- code quality.

@jakobbotsch
Copy link
Member

It looks like clang is able to prove that these functions are not vulnerable to stack smashing, so it produces the same result even with -fstack-protector-strong (uses strong heuristics to find vulnerable functions): https://godbolt.org/z/j4Y39nfrK

Clang still does not seem to produce any stack guards for the following code that is clearly vulnerable to buffer overflows when I specify -fstack-protector-strong:

void foo(char* p);

S1 X1()
{
    S1 s { };
    foo(s.e);
    return s;
}

@jakobbotsch
Copy link
Member

jakobbotsch commented Feb 24, 2023

With that said, we could definitely omit it in some cases, I suppose. Today we just turn the check on whenever we see a struct with UnsafeValueType attribute (which the fixed causes Roslyn to add), or when we see stackalloc.

@JulieLeeMSFT JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label Mar 6, 2023
@JulieLeeMSFT JulieLeeMSFT modified the milestones: 8.0.0, Future Mar 6, 2023
@JulieLeeMSFT
Copy link
Member

Setting to Future for now. I don't think we can accommodate it in .NET 8.

@ghost ghost added the in-pr There is an active PR which will close this issue when it is merged label Mar 10, 2023
@ghost ghost removed the in-pr There is an active PR which will close this issue when it is merged label Mar 13, 2023
@am11
Copy link
Member Author

am11 commented Mar 16, 2023

@EgorBo, was it so that the heuristic is adjusted and we should expect better codegen for X2() (without CORINFO_HELP_MEMSET etc.) as it does for X1()? I couldn't find any difference with 1936536 (crossgen2 --optimize --instruction-set avx512dq ...)

@am11 am11 reopened this Mar 16, 2023
@am11
Copy link
Member Author

am11 commented Mar 22, 2023

With #83638, memset is inlined for int16 case as well:

C:X2():S2:this:
       sub      rsp, 24
       vzeroupper 
       mov      rax, qword ptr [(reloc)]
       mov      qword ptr [rsp+10H], rax
       xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rsi], ymm0
       vmovdqu  ymmword ptr[rsi+20H], ymm0
       vmovdqu  ymmword ptr[rsi+40H], ymm0
       vmovdqu  ymmword ptr[rsi+60H], ymm0
       vmovdqu  ymmword ptr[rsi+80H], ymm0
       mov      qword ptr [rsi+A0H], rax
       mov      rax, rsi
       lea      rdi, [(reloc)]
       mov      rdi, qword ptr [rdi]
       cmp      qword ptr [rsp+10H], rdi
       je       SHORT G_M46095_IG03
       call     [CORINFO_HELP_FAIL_FAST]
G_M46095_IG03:
       nop      
       add      rsp, 24
       ret      

C:.ctor():this:
       ret      

Thanks @EgorBo! :)

@am11 am11 closed this as completed Mar 22, 2023
@ghost ghost locked as resolved and limited conversation to collaborators Apr 21, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants