JIT: Have lowering set up IR for post-indexed addressing and make strength reduced IV updates amenable to post-indexed addressing #105185

jakobbotsch · 2024-07-20T17:04:43Z

This adds a transformation in lowering that tries to set up the IR to be
amenable to post-indexed addressing in the backend. It does so by
looking for RMW additions/subtractions of a local that was also recently
used as the address to an indirection, and making them adjacent.

Additionally, have strength reduction try to insert IV updates after the last
use if that last use is a legal insertion point. This allows the lowering transformation
to kick in.

For a simple loop:

[MethodImpl(MethodImplOptions.NoInlining)]
public static int Sum(int[] arr)
{
    int sum = 0;
    foreach (int x in arr)
    {
        sum += x;
    }

    return sum;
}

this results in:

@@ -19,12 +19,11 @@ G_M53154_IG03:        ; bbWeight=0.25, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, b
 						;; size=4 bbWeight=0.25 PerfScore 0.12
 
 G_M53154_IG04:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0001 {x0}, byref, isz
-            ldr     w3, [x0]
+            ldr     w3, [x0], #0x04
             add     w1, w3, w1
-            add     x0, x0, #4
             sub     w2, w2, #1
             cbnz    w2, G_M53154_IG04
-						;; size=20 bbWeight=4 PerfScore 22.00
+						;; size=16 bbWeight=4 PerfScore 20.00
 
 G_M53154_IG05:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
                              ; byrRegs -[x0]
@@ -35,5 +34,5 @@ G_M53154_IG06:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 60
+; Total bytes of code: 56

The .NET 8 vs .NET 9 codegen diff for this loop becomes:

@@ -7,11 +7,10 @@ G_M53154_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 G_M53154_IG02:        ; bbWeight=1, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byref, isz
                              ; gcrRegs +[x0]
             mov     w1, wzr
-            mov     w2, wzr
-            ldr     w3, [x0, #0x08]
-            cmp     w3, #0
+            ldr     w2, [x0, #0x08]
+            cmp     w2, #0
             ble     G_M53154_IG05
-						;; size=20 bbWeight=1 PerfScore 5.50
+						;; size=16 bbWeight=1 PerfScore 5.00
 
 G_M53154_IG03:        ; bbWeight=0.25, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byref
             add     x0, x0, #16
@@ -20,12 +19,11 @@ G_M53154_IG03:        ; bbWeight=0.25, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, b
 						;; size=4 bbWeight=0.25 PerfScore 0.12
 
 G_M53154_IG04:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0001 {x0}, byref, isz
-            ldr     w4, [x0, w2, UXTW #2]
-            add     w1, w4, w1
-            add     w2, w2, #1
-            cmp     w3, w2
-            bgt     G_M53154_IG04
-						;; size=20 bbWeight=4 PerfScore 22.00
+            ldr     w3, [x0], #0x04
+            add     w1, w3, w1
+            sub     w2, w2, #1
+            cbnz    w2, G_M53154_IG04
+						;; size=16 bbWeight=4 PerfScore 20.00
 
 G_M53154_IG05:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
                              ; byrRegs -[x0]
@@ -36,5 +34,5 @@ G_M53154_IG06:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 64
+; Total bytes of code: 56

dotnet-policy-service · 2024-07-20T17:05:24Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

jakobbotsch · 2024-07-20T19:52:10Z

/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress

azure-pipelines · 2024-07-20T19:52:30Z

Azure Pipelines successfully started running 2 pipeline(s).

jakobbotsch · 2024-07-21T08:02:32Z

Ran jitstress in https://dev.azure.com/dnceng-public/public/_build/results?buildId=749010&view=results and libraries-jitstress in https://dev.azure.com/dnceng-public/public/_build/results?buildId=749011&view=results.

jitstress failures were #105186.

libraries-jitstress failures were #105092, #105189, #102370.

jakobbotsch · 2024-07-21T10:46:15Z

cc @dotnet/jit-contrib PTAL @AndyAyersMS

Diffs. Some cool diffs:

AndyAyersMS · 2024-07-21T17:16:01Z

Interesting diff in windows arm benchmarks pgo

+4 (+0.09%) : 7422.dasm - System.Text.RegularExpressions.RegexPrefixAnalyzer:<FindPrefixes>g__FindPrefixesCore|0_1(System.Text.RegularExpressions.RegexNode,System.Collections.Generic.List`1[System.Text.StringBuilder],ubyte):ubyte (Tier0-FullOpts)
@@ -1600,10 +1600,12 @@ G_M12455_IG128:        ; bbWeight=0.05, gcrefRegs=180000 {x19 x20}, byrefRegs=C0
 						;; size=4 bbWeight=0.05 PerfScore 0.02
 G_M12455_IG129:        ; bbWeight=0.49, gcrefRegs=180000 {x19 x20}, byrefRegs=C00000 {x22 x23}, byref, isz
             ldrh    w1, [x23, x21]
-            stp     wzr, w1, [fp, #0x50]	// [V16 loc13], [V15 loc12]
+            str     w1, [fp, #0x54]	// [V15 loc12]
+            add     x21, x21, #2
+            str     wzr, [fp, #0x50]	// [V16 loc13]

AndyAyersMS · 2024-07-21T16:50:53Z

src/coreclr/jit/lowerarmarch.cpp

+    int maxCount = min(m_blockIndirs.Height(), POST_INDEXED_ADDRESSING_MAX_DISTANCE / 2);
+    for (int i = 0; i < maxCount; i++)
+    {
+        SavedIndir& prev = m_blockIndirs.TopRef(i);


Would it be more efficient to start checking with the last indir instead of the first?

This does start with the last indir (since it is using TopRef instead of BottomRef)

src/coreclr/jit/lowerarmarch.cpp

AndyAyersMS · 2024-07-21T17:02:23Z

src/coreclr/jit/lowerarmarch.cpp

+    assert((prevIndir->gtLIRFlags & LIR::Flags::Mark) == 0);
+    m_scratchSideEffects.Clear();
+
+    for (GenTree* cur = prevIndir->gtNext; cur != store; cur = cur->gtNext)


I wonder if this could be cheaper if you computed two side effect sets and then checked for interference. But it probably doesn't make much difference.

Hmm, possibly -- although it would be a bit less precise than what's here since not all nodes that are part of store's dataflow necessarily happen after all the nodes we are moving.

This adds a transformation in lowering that tries to set up the IR to be amenable to post-indexed addressing in the backend. It does so by looking for RMW additions/subtractions of a local that was also recently used as the address to an indirection.

…sing — On arm64, have strength reduction try to insert IV updates after the last use if that last use is a legal insertion point. This often allows the backend to use post-indexed addressing to combine the use with the IV update.

jakobbotsch · 2024-07-21T17:49:52Z

Interesting diff in windows arm benchmarks pgo

+4 (+0.09%) : 7422.dasm - System.Text.RegularExpressions.RegexPrefixAnalyzer:<FindPrefixes>g__FindPrefixesCore|0_1(System.Text.RegularExpressions.RegexNode,System.Collections.Generic.List`1[System.Text.StringBuilder],ubyte):ubyte (Tier0-FullOpts)
@@ -1600,10 +1600,12 @@ G_M12455_IG128:        ; bbWeight=0.05, gcrefRegs=180000 {x19 x20}, byrefRegs=C0
 						;; size=4 bbWeight=0.05 PerfScore 0.02
 G_M12455_IG129:        ; bbWeight=0.49, gcrefRegs=180000 {x19 x20}, byrefRegs=C00000 {x22 x23}, byref, isz
             ldrh    w1, [x23, x21]
-            stp     wzr, w1, [fp, #0x50]	// [V16 loc13], [V15 loc12]
+            str     w1, [fp, #0x54]	// [V15 loc12]
+            add     x21, x21, #2
+            str     wzr, [fp, #0x50]	// [V16 loc13]

We end up with this IR after strength reduction:

***** BB135 [0056]
STMT00207 ( 0x2B1[E-] ... 0x2BB )
N007 ( 10,  7) [000666] DA-XGO-----                         ▌  STORE_LCL_VAR int    V15 loc12        d:1 $d1a
N006 ( 10,  7) [000664] ---XGO-N---                         └──▌  COMMA     ushort <l:$b0d, c:$b0c>
N001 (  0,  0) [000657] -----------                            ├──▌  NOP       void  
N005 ( 10,  7) [003041] ---XGO-----                            └──▌  IND       ushort <l:$b0a, c:$b0b>
N004 (  7,  5) [000663] ----GO-N---                               └──▌  ADD       byref  $c12
N002 (  3,  2) [000662] -----------                                  ├──▌  LCL_VAR   byref  V165 tmp129      u:2 $c11
N003 (  3,  2) [003994] -----------                                  └──▌  LCL_VAR   long   V247 rat2        

***** BB135 [0056]
STMT00720 ( ??? ... ??? )
N004 (  9,  8) [003993] DA---------                         ▌  STORE_LCL_VAR long   V247 rat2        
N003 (  5,  5) [003992] -----------                         └──▌  ADD       long  
N001 (  3,  2) [003991] -----------                            ├──▌  LCL_VAR   long   V247 rat2        
N002 (  1,  2) [003989] -----------                            └──▌  CNS_INT   long   2 $34d

***** BB135 [0056]
STMT00208 ( 0x2BD[E-] ... 0x2BE )
N002 (  1,  3) [000668] DA---------                         ▌  STORE_LCL_VAR int    V16 loc13        d:1 $VN.Void
N001 (  1,  2) [000667] -----------                         └──▌  CNS_INT   int    0 $c0

Lowering does not try to make the indirection [003041] and the update [003993] adjacent since the indirection isn't directly on V247, so there won't be a chance for post-indexed addressing. This also breaks the stp that was previously emitted for V15 and V16, which end up being stack allocated in adjacent slots.

Strength reduction doesn't do much (any) sanity checking of whether we actually expect to be able to use post-indexed addressing after moving the IV update. That would require us to check that the use is of a supported pattern. But I figure that complication is unnecessary since the exact place where we update the IV shouldn't matter much here -- it is live throughout the loop anyway. It might even be better for scheduling purposes to update it as soon as possible after that last use.

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jul 20, 2024

dotnet-policy-service bot assigned jakobbotsch Jul 20, 2024

jakobbotsch mentioned this pull request Jul 20, 2024

JIT/opt/ValueNumbering/ExposedLocalsNumbering/ExposedLocalsNumbering failing in CI #105187

Open

jakobbotsch force-pushed the lower-post-indexing branch from e907ed1 to 9598fdd Compare July 21, 2024 07:59

jakobbotsch mentioned this pull request Jul 21, 2024

JIT: Missing support for pre-indexed addressing on arm64 #105193

Open

jakobbotsch marked this pull request as ready for review July 21, 2024 10:42

jakobbotsch requested a review from AndyAyersMS July 21, 2024 10:46

AndyAyersMS reviewed Jul 21, 2024

View reviewed changes

jakobbotsch added 2 commits July 21, 2024 19:35

jakobbotsch force-pushed the lower-post-indexing branch from 9598fdd to c5cc900 Compare July 21, 2024 17:35

AndyAyersMS approved these changes Jul 21, 2024

View reviewed changes

jakobbotsch merged commit 7dd68f4 into dotnet:main Jul 22, 2024
102 of 108 checks passed

jakobbotsch deleted the lower-post-indexing branch July 22, 2024 09:06

LoopedBard3 mentioned this pull request Jul 25, 2024

[Perf] Linux/arm64: 4 Improvements on 7/22/2024 3:17:40 PM dotnet/perf-autofiling-issues#38912

Closed

github-actions bot locked and limited conversation to collaborators Aug 22, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT: Have lowering set up IR for post-indexed addressing and make strength reduced IV updates amenable to post-indexed addressing #105185

JIT: Have lowering set up IR for post-indexed addressing and make strength reduced IV updates amenable to post-indexed addressing #105185

jakobbotsch commented Jul 20, 2024 •

edited

Loading

dotnet-policy-service bot commented Jul 20, 2024

jakobbotsch commented Jul 20, 2024

azure-pipelines bot commented Jul 20, 2024

jakobbotsch commented Jul 21, 2024

jakobbotsch commented Jul 21, 2024

AndyAyersMS commented Jul 21, 2024

AndyAyersMS Jul 21, 2024

jakobbotsch Jul 21, 2024

AndyAyersMS Jul 21, 2024

jakobbotsch Jul 21, 2024 •

edited

Loading

jakobbotsch commented Jul 21, 2024 •

edited

Loading

JIT: Have lowering set up IR for post-indexed addressing and make strength reduced IV updates amenable to post-indexed addressing #105185

JIT: Have lowering set up IR for post-indexed addressing and make strength reduced IV updates amenable to post-indexed addressing #105185

Conversation

jakobbotsch commented Jul 20, 2024 • edited Loading

dotnet-policy-service bot commented Jul 20, 2024

jakobbotsch commented Jul 20, 2024

azure-pipelines bot commented Jul 20, 2024

jakobbotsch commented Jul 21, 2024

jakobbotsch commented Jul 21, 2024

AndyAyersMS commented Jul 21, 2024

AndyAyersMS Jul 21, 2024

Choose a reason for hiding this comment

jakobbotsch Jul 21, 2024

Choose a reason for hiding this comment

AndyAyersMS Jul 21, 2024

Choose a reason for hiding this comment

jakobbotsch Jul 21, 2024 • edited Loading

Choose a reason for hiding this comment

jakobbotsch commented Jul 21, 2024 • edited Loading

jakobbotsch commented Jul 20, 2024 •

edited

Loading

jakobbotsch Jul 21, 2024 •

edited

Loading

jakobbotsch commented Jul 21, 2024 •

edited

Loading