JIT: Clone all local addresses when importing `dup` #72714

jakobbotsch · 2022-07-23T09:35:51Z

Roslyn emits dup for the field address when compound assignment
operators are used on struct fields. We would previously spill this
address leading us to mark such structs as address exposed and disabling
promotion.

Also allow removing unnecessary casts in cases like

ASG
  LCL_FLD ubyte V00
  CAST int <- ubyte <- int
    ...

we only allowed this cast removal for LCL_VAR and IND before, which led
to unnecessary new casts in some cases with this change.

Roslyn emits `dup` for the field address when compound assignment operators are used on struct fields. We would previously spill this address leading us to mark such structs as address exposed and disabling promotion. Also allow removing unnecessary casts in cases like ``` ASG LCL_FLD ubyte V00 CAST int <- ubyte <- int ... ``` we only allowed this cast removal for LCL_VAR and IND before, which led to unnecessary new casts in some cases with this change.

ghost · 2022-07-23T09:36:07Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Roslyn emits dup for the field address when compound assignment
operators are used on struct fields. We would previously spill this
address leading us to mark such structs as address exposed and disabling
promotion.

Also allow removing unnecessary casts in cases like

ASG
  LCL_FLD ubyte V00
  CAST int <- ubyte <- int
    ...

we only allowed this cast removal for LCL_VAR and IND before, which led
to unnecessary new casts in some cases with this change.

Author:	jakobbotsch
Assignees:	-
Labels:	`area-CodeGen-coreclr`
Milestone:	-

jakobbotsch · 2022-07-23T09:39:04Z

Example codegen diff for

[MethodImpl(MethodImplOptions.NoInlining)]
static int Test(int its)
{
    Foo f = new Foo();
    for (int i = 0; i < its; i++)
    {
        f.A += f.B * i;
        f.B = i;
    }

    return f.A + f.B;
}

struct Foo
{
    public int A, B;
}

is

 ; No PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T02] (  4,  7   )     int  ->  rcx         single-def
-;  V01 loc0         [V01    ] (  5, 14   )  struct ( 8) [rsp+00H]   do-not-enreg[XS] must-init addr-exposed ld-addr-op
-;  V02 loc1         [V02,T01] (  6, 21   )     int  ->  rax        
+;  V00 arg0         [V00,T03] (  4,  7   )     int  ->  rcx         single-def
+;* V01 loc0         [V01    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op
+;  V02 loc1         [V02,T00] (  5, 17   )     int  ->   r8        
 ;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+00H]   "OutgoingArgSpace"
-;  V04 tmp1         [V04,T00] (  3, 24   )   byref  ->  rdx         "dup spill"
-;  V05 tmp2         [V05    ] (  2,  5   )     int  ->  [rsp+00H]   do-not-enreg[X] addr-exposed V01.A(offs=0x00) P-DEP "field V01.A (fldOffset=0x0)"
-;  V06 tmp3         [V06    ] (  3,  9   )     int  ->  [rsp+04H]   do-not-enreg[X] addr-exposed V01.B(offs=0x04) P-DEP "field V01.B (fldOffset=0x4)"
+;  V04 tmp1         [V04,T02] (  4, 10   )     int  ->  rax         V01.A(offs=0x00) P-INDEP "field V01.A (fldOffset=0x0)"
+;  V05 tmp2         [V05,T01] (  5, 14   )     int  ->  rdx         V01.B(offs=0x04) P-INDEP "field V01.B (fldOffset=0x4)"
 ;
-; Lcl frame size = 8
+; Lcl frame size = 0
 
 G_M46009_IG01:
-       push     rax
-       xor      eax, eax
-       mov      qword ptr [rsp], rax
-						;; size=7 bbWeight=1    PerfScore 2.25
+						;; size=0 bbWeight=1    PerfScore 0.00
 G_M46009_IG02:
        xor      eax, eax
+       xor      edx, edx
+       xor      r8d, r8d
        test     ecx, ecx
        jle      SHORT G_M46009_IG04
-       align    [3 bytes for IG03]
-						;; size=9 bbWeight=1    PerfScore 1.75
+       align    [0 bytes for IG03]
+						;; size=11 bbWeight=1    PerfScore 2.00
 G_M46009_IG03:
-       lea      rdx, bword ptr [rsp]
-       mov      r8d, eax
-       imul     r8d, dword ptr [rsp+04H]
-       add      dword ptr [rdx], r8d
-       mov      dword ptr [rsp+04H], eax
-       inc      eax
-       cmp      eax, ecx
+       imul     edx, r8d
+       add      eax, edx
+       mov      edx, r8d
+       lea      r8d, [rdx+1]
+       cmp      r8d, ecx
        jl       SHORT G_M46009_IG03
-						;; size=26 bbWeight=4    PerfScore 41.00
+						;; size=18 bbWeight=4    PerfScore 17.00
 G_M46009_IG04:
-       mov      eax, dword ptr [rsp]
-       add      eax, dword ptr [rsp+04H]
-						;; size=7 bbWeight=1    PerfScore 3.00
+       add      eax, edx
+						;; size=2 bbWeight=1    PerfScore 0.25
 G_M46009_IG05:
-       add      rsp, 8
        ret      
-						;; size=5 bbWeight=1    PerfScore 1.25
+						;; size=1 bbWeight=1    PerfScore 1.00

jakobbotsch · 2022-08-24T10:55:25Z

cc @dotnet/jit-contrib PTAL @EgorBo

Diffs

Spot checking the regressions it seems to be mostly because we do not support RMW ops for LCL_FLD that we managed to convert to RMW ops before when we created a temp for the address:

 G_M33991_IG05:        ; gcrefRegs=00000000 {}, byrefRegs=00000001 {rax}, byref
        mov      byte  ptr [rax], dl
-       lea      rax, bword ptr [rsp]
-       or       byte  ptr [rax], 64
        mov      eax, dword ptr [rsp]
        ; byrRegs -[rax]
-                                               ;; size=12 bbWeight=1    PerfScore 5.50
+       movzx    rax, al
+       or       eax, 64
+       mov      byte  ptr [rsp], al
+       mov      eax, dword ptr [rsp]
+                                               ;; size=17 bbWeight=1    PerfScore 4.50
 G_M33991_IG06:        ; , epilog, nogc, extend
        add      rsp, 8
        ret

This seems to happen rarely so I think we can wait and see if there are any significant regressions in the micro benchmarks.

There are a lot of asp.net improvements due to changing the compDbgCode check to OptimizationEnabled() instead, causing the JIT to leave the cloning/temp decision up to impCloneExpr instead for the tier-0 compilations there.

src/coreclr/jit/importer.cpp

DrewScoggins · 2022-08-30T16:47:41Z

Improvements on
win-x86: dotnet/perf-autofiling-issues#8174
win-x64 (AMD): dotnet/perf-autofiling-issues#8138

AndyAyersMS · 2022-09-01T16:44:03Z

Possible regressions:

[Perf] ubuntu 18.04/arm64 : Regression on 8/25/2022 4:02:22 AM perf-autofiling-issues#8222

Possible improvements:

[Perf] Windows 10.0.25094/arm64 : Improvement on 8/29/2022 8:26:03 AM perf-autofiling-issues#8212
[Perf] Windows 10.0.19041/arm64 : Improvement on 8/29/2022 11:36:03 AM perf-autofiling-issues#8235

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jul 23, 2022

ghost assigned jakobbotsch Jul 23, 2022

jakobbotsch added this to the 8.0.0 milestone Jul 29, 2022

jakobbotsch marked this pull request as ready for review August 17, 2022 20:30

jakobbotsch closed this Aug 17, 2022

jakobbotsch reopened this Aug 17, 2022

jakobbotsch closed this Aug 23, 2022

jakobbotsch reopened this Aug 23, 2022

SingleAccretion reviewed Aug 24, 2022

View reviewed changes

src/coreclr/jit/importer.cpp Show resolved Hide resolved

jakobbotsch mentioned this pull request Aug 25, 2022

JIT: Reconsider setting GTF_GLOB_REF in gtNewFieldRef #74563

Closed

jakobbotsch requested a review from EgorBo August 25, 2022 09:34

AndyAyersMS approved these changes Aug 26, 2022

View reviewed changes

jakobbotsch merged commit 8c4e6e2 into dotnet:main Aug 29, 2022

jakobbotsch deleted the clone-locations branch August 29, 2022 10:18

ghost locked as resolved and limited conversation to collaborators Oct 1, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT: Clone all local addresses when importing `dup` #72714

JIT: Clone all local addresses when importing `dup` #72714

jakobbotsch commented Jul 23, 2022

ghost commented Jul 23, 2022

jakobbotsch commented Jul 23, 2022 •

edited

Loading

jakobbotsch commented Aug 24, 2022

DrewScoggins commented Aug 30, 2022 •

edited

Loading

AndyAyersMS commented Sep 1, 2022 •

edited

Loading

JIT: Clone all local addresses when importing dup #72714

JIT: Clone all local addresses when importing dup #72714

Conversation

jakobbotsch commented Jul 23, 2022

ghost commented Jul 23, 2022

jakobbotsch commented Jul 23, 2022 • edited Loading

jakobbotsch commented Aug 24, 2022

DrewScoggins commented Aug 30, 2022 • edited Loading

AndyAyersMS commented Sep 1, 2022 • edited Loading

JIT: Clone all local addresses when importing `dup` #72714

JIT: Clone all local addresses when importing `dup` #72714

jakobbotsch commented Jul 23, 2022 •

edited

Loading

DrewScoggins commented Aug 30, 2022 •

edited

Loading

AndyAyersMS commented Sep 1, 2022 •

edited

Loading