Commit a930ef5
JIT: Reorder stores to make them amenable to stp optimization (dotnet#102133)
This generalizes the indir reordering optimization (which currently triggers only for loads) to also kick in for GT_STOREIND nodes. The main complication is that the data node of the second indirection needs its own reordering with respect to the previous indirection. The existing logic works by reordering all nodes between the first and second indirection that are unrelated to the second indirection's computation to happen after it. Once that is done, we know there are no uses of the first indirection's result between it and the second indirection, so after the necessary interference checks we can safely move the previous indirection to happen after the data node of the second indirection.

Example:

```csharp
class Body { public double x, y, z, vx, vy, vz, mass; }

static void Advance(double dt, Body[] bodies)
{
    foreach (Body b in bodies)
    {
        b.x += dt * b.vx;
        b.y += dt * b.vy;
        b.z += dt * b.vz;
    }
}
```

Diff:

```diff
@@ -1,18 +1,17 @@
-G_M55007_IG04:        ;; offset=0x001C
+G_M55007_IG04:        ;; offset=0x0020
         ldr     x3, [x0, w1, UXTW #3]
         ldp     d16, d17, [x3, #0x08]
         ldp     d18, d19, [x3, #0x20]
         fmul    d18, d0, d18
         fadd    d16, d16, d18
-        str     d16, [x3, #0x08]
-        fmul    d16, d0, d19
-        fadd    d16, d17, d16
-        str     d16, [x3, #0x10]
+        fmul    d18, d0, d19
+        fadd    d17, d17, d18
+        stp     d16, d17, [x3, #0x08]
         ldr     d16, [x3, #0x18]
         ldr     d17, [x3, #0x30]
         fmul    d17, d0, d17
         fadd    d16, d16, d17
         str     d16, [x3, #0x18]
         add     w1, w1, #1
         cmp     w2, w1
         bgt     G_M55007_IG04
```
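For intuition, here is a minimal standalone C++ sketch of the two-step scheme the message describes (illustrative only: `Node` and its flags are hypothetical stand-ins for LIR nodes, the `MarkTree` marks, and `indir->Data()`, and all interference checks are elided). Step 1 moves every node between the two stores that is unrelated to the second store's data computation to after the second store; step 2 then hoists the first store to just after the second store's data node, leaving the two stores adjacent:

```cpp
// Illustrative model only; not the JIT's real LIR data structures.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Node
{
    std::string name;
    bool inSecondIndirDataFlow; // would be computed via MarkTree in the JIT
    bool isSecondIndirData;     // the data node of the second store
};

int main()
{
    // LIR range [prevStore .. secondStore]: t1/t2 compute the second store's
    // data, u1/u2 are unrelated computations sitting in between.
    std::vector<Node> range = {{"prevStore", false, false}, {"t1", true, false},
                               {"u1", false, false},        {"t2", true, true},
                               {"u2", false, false},        {"secondStore", false, false}};

    // Step 1: move interior nodes unrelated to the second store's computation
    // to after it. Afterwards nothing between the two stores uses prevStore.
    std::vector<Node> reordered{range.front()};
    std::vector<Node> unrelated;
    for (size_t i = 1; i + 1 < range.size(); i++)
        (range[i].inSecondIndirDataFlow ? reordered : unrelated).push_back(range[i]);
    reordered.push_back(range.back());
    reordered.insert(reordered.end(), unrelated.begin(), unrelated.end());
    // Now: prevStore t1 t2 secondStore u1 u2

    // Step 2 (stores only): hoist prevStore to just after the second store's
    // data node (t2), making the two stores adjacent and stp-able.
    size_t dataIdx = 0;
    for (size_t i = 0; i < reordered.size(); i++)
        if (reordered[i].isSecondIndirData)
            dataIdx = i;
    std::rotate(reordered.begin(), reordered.begin() + 1, reordered.begin() + dataIdx + 1);

    for (const Node& n : reordered)
        std::cout << n.name << ' ';
    std::cout << '\n'; // t1 t2 prevStore secondStore u1 u2
}
```

In the real implementation both steps are guarded by the side-effect interference checks visible in the lower.cpp diff below.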
1 parent f0ee416 commit a930ef5

6 files changed: +137 −41 lines changed
src/coreclr/jit/lower.cpp

+108 −26

```diff
@@ -414,8 +414,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
         }
 
         case GT_STOREIND:
-            LowerStoreIndirCommon(node->AsStoreInd());
-            break;
+            return LowerStoreIndirCommon(node->AsStoreInd());
 
         case GT_ADD:
         {
@@ -8895,7 +8894,10 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind)
 // Arguments:
 //    ind - the store indirection node we are lowering.
 //
-void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
+// Return Value:
+//    Next node to lower.
+//
+GenTree* Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
 {
     assert(ind->TypeGet() != TYP_STRUCT);
 
@@ -8910,28 +8912,30 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
 #endif
     TryCreateAddrMode(ind->Addr(), isContainable, ind);
 
-    if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
+    if (comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind))
     {
+        return ind->gtNext;
+    }
+
 #ifndef TARGET_XARCH
-        if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
+    if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL))
+    {
+        const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
+        if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
         {
-            const ssize_t handle = ind->Data()->AsIntCon()->IconValue();
-            if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast<CORINFO_OBJECT_HANDLE>(handle)))
-            {
-                // On platforms with weaker memory model we need to make sure we use a store with the release semantic
-                // when we publish a potentially mutable object
-                // See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
-                // https://github.com/dotnet/runtime/pull/76112#discussion_r980639782
+            // On platforms with weaker memory model we need to make sure we use a store with the release semantic
+            // when we publish a potentially mutable object
+            // See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and
+            // https://github.com/dotnet/runtime/pull/76112#discussion_r980639782
 
-                // This can be relaxed to "just make sure to use stlr/memory barrier" if needed
-                ind->gtFlags |= GTF_IND_VOLATILE;
-            }
+            // This can be relaxed to "just make sure to use stlr/memory barrier" if needed
+            ind->gtFlags |= GTF_IND_VOLATILE;
         }
+    }
 #endif
 
-        LowerStoreIndirCoalescing(ind);
-        LowerStoreIndir(ind);
-    }
+    LowerStoreIndirCoalescing(ind);
+    return LowerStoreIndir(ind);
 }
 
 //------------------------------------------------------------------------
@@ -9014,7 +9018,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
 #ifdef TARGET_ARM64
     if (comp->opts.OptimizationEnabled() && ind->OperIs(GT_IND))
     {
-        OptimizeForLdp(ind);
+        OptimizeForLdpStp(ind);
     }
 #endif
 
@@ -9029,7 +9033,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind)
 // cases passing the distance check, but 82 out of these 112 extra cases were
 // then rejected due to interference. So 16 seems like a good number to balance
 // the throughput costs.
-const int LDP_REORDERING_MAX_DISTANCE = 16;
+const int LDP_STP_REORDERING_MAX_DISTANCE = 16;
 
 //------------------------------------------------------------------------
 // OptimizeForLdp: Record information about an indirection, and try to optimize
@@ -9042,7 +9046,7 @@ const int LDP_REORDERING_MAX_DISTANCE = 16;
 // Returns:
 //    True if the optimization was successful.
 //
-bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
+bool Lowering::OptimizeForLdpStp(GenTreeIndir* ind)
 {
     if (!ind->TypeIs(TYP_INT, TYP_LONG, TYP_FLOAT, TYP_DOUBLE, TYP_SIMD8, TYP_SIMD16) || ind->IsVolatile())
     {
@@ -9060,7 +9064,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
 
     // Every indirection takes an expected 2+ nodes, so we only expect at most
    // half the reordering distance to be candidates for the optimization.
-    int maxCount = min(m_blockIndirs.Height(), LDP_REORDERING_MAX_DISTANCE / 2);
+    int maxCount = min(m_blockIndirs.Height(), LDP_STP_REORDERING_MAX_DISTANCE / 2);
     for (int i = 0; i < maxCount; i++)
     {
         SavedIndir& prev = m_blockIndirs.TopRef(i);
@@ -9075,11 +9079,22 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
             continue;
         }
 
+        if (prevIndir->gtNext == nullptr)
+        {
+            // Deleted by other optimization
+            continue;
+        }
+
+        if (prevIndir->OperIsStore() != ind->OperIsStore())
+        {
+            continue;
+        }
+
         JITDUMP("[%06u] and [%06u] are indirs off the same base with offsets +%03u and +%03u\n",
                 Compiler::dspTreeID(ind), Compiler::dspTreeID(prevIndir), (unsigned)offs, (unsigned)prev.Offset);
         if (std::abs(offs - prev.Offset) == genTypeSize(ind))
         {
-            JITDUMP("  ..and they are amenable to ldp optimization\n");
+            JITDUMP("  ..and they are amenable to ldp/stp optimization\n");
             if (TryMakeIndirsAdjacent(prevIndir, ind))
             {
                 // Do not let the previous one participate in
@@ -9115,7 +9130,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind)
 bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir)
 {
     GenTree* cur = prevIndir;
-    for (int i = 0; i < LDP_REORDERING_MAX_DISTANCE; i++)
+    for (int i = 0; i < LDP_STP_REORDERING_MAX_DISTANCE; i++)
     {
         cur = cur->gtNext;
         if (cur == indir)
@@ -9172,8 +9187,16 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
     INDEBUG(dumpWithMarks());
     JITDUMP("\n");
 
+    if ((prevIndir->gtLIRFlags & LIR::Flags::Mark) != 0)
+    {
+        JITDUMP("Previous indir is part of the data flow of current indir\n");
+        UnmarkTree(indir);
+        return false;
+    }
+
     m_scratchSideEffects.Clear();
 
+    bool sawData = false;
     for (GenTree* cur = prevIndir->gtNext; cur != indir; cur = cur->gtNext)
     {
         if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
@@ -9186,6 +9209,11 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
                 UnmarkTree(indir);
                 return false;
             }
+
+            if (indir->OperIsStore())
+            {
+                sawData |= cur == indir->Data();
+            }
         }
         else
         {
@@ -9197,6 +9225,13 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
 
     if (m_scratchSideEffects.InterferesWith(comp, indir, true))
     {
+        if (!indir->OperIsLoad())
+        {
+            JITDUMP("Have conservative interference with last store. Giving up.\n");
+            UnmarkTree(indir);
+            return false;
+        }
+
         // Try a bit harder, making use of the following facts:
         //
         // 1. We know the indir is non-faulting, so we do not need to worry
@@ -9293,8 +9328,39 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
         }
     }
 
-    JITDUMP("Interference checks passed. Moving nodes that are not part of data flow of [%06u]\n\n",
-            Compiler::dspTreeID(indir));
+    JITDUMP("Interference checks passed: can move unrelated nodes past second indir.\n");
+
+    if (sawData)
+    {
+        // If the data node of 'indir' is between 'prevIndir' and 'indir' then
+        // try to move the previous indir up to happen after the data
+        // computation. We will be moving all nodes unrelated to the data flow
+        // past 'indir', so we only need to check interference between
+        // 'prevIndir' and all nodes that are part of 'indir's dataflow.
+        m_scratchSideEffects.Clear();
+        m_scratchSideEffects.AddNode(comp, prevIndir);
+
+        for (GenTree* cur = prevIndir->gtNext;; cur = cur->gtNext)
+        {
+            if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0)
+            {
+                if (m_scratchSideEffects.InterferesWith(comp, cur, true))
+                {
+                    JITDUMP("Cannot move prev indir [%06u] up past [%06u] to get it past the data computation\n",
+                            Compiler::dspTreeID(prevIndir), Compiler::dspTreeID(cur));
+                    UnmarkTree(indir);
+                    return false;
+                }
+            }
+
+            if (cur == indir->Data())
+            {
+                break;
+            }
+        }
+    }
+
+    JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir));
 
     GenTree* previous = prevIndir;
     for (GenTree* node = prevIndir->gtNext;;)
@@ -9317,6 +9383,22 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi
         node = next;
     }
 
+    if (sawData)
+    {
+        // For some reason LSRA is not able to reuse a constant if both LIR
+        // temps are live simultaneously, so skip moving in those cases and
+        // expect LSRA to reuse the constant instead.
+        if (indir->Data()->OperIs(GT_CNS_INT, GT_CNS_DBL) && GenTree::Compare(indir->Data(), prevIndir->Data()))
+        {
+            JITDUMP("Not moving previous indir since we are expecting constant reuse for the data\n");
+        }
+        else
+        {
+            BlockRange().Remove(prevIndir);
+            BlockRange().InsertAfter(indir->Data(), prevIndir);
+        }
+    }
+
     JITDUMP("Result:\n\n");
     INDEBUG(dumpWithMarks());
     JITDUMP("\n");
```

src/coreclr/jit/lower.h

+3 −3

```diff
@@ -345,13 +345,13 @@ class Lowering final : public Phase
     bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const;
 
     // Per tree node member functions
-    void LowerStoreIndirCommon(GenTreeStoreInd* ind);
+    GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind);
     GenTree* LowerIndir(GenTreeIndir* ind);
-    bool OptimizeForLdp(GenTreeIndir* ind);
+    bool OptimizeForLdpStp(GenTreeIndir* ind);
     bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir);
     void MarkTree(GenTree* root);
     void UnmarkTree(GenTree* root);
-    void LowerStoreIndir(GenTreeStoreInd* node);
+    GenTree* LowerStoreIndir(GenTreeStoreInd* node);
     void LowerStoreIndirCoalescing(GenTreeIndir* node);
     GenTree* LowerAdd(GenTreeOp* node);
     GenTree* LowerMul(GenTreeOp* mul);
```

src/coreclr/jit/lowerarmarch.cpp

+12 −2

```diff
@@ -491,11 +491,21 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 //    node - The indirect store node (GT_STORE_IND) of interest
 //
 // Return Value:
-//    None.
+//    Next node to lower.
 //
-void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
+GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 {
+    GenTree* next = node->gtNext;
     ContainCheckStoreIndir(node);
+
+#ifdef TARGET_ARM64
+    if (comp->opts.OptimizationEnabled())
+    {
+        OptimizeForLdpStp(node);
+    }
+#endif
+
+    return next;
 }
 
 //------------------------------------------------------------------------
```
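A note on this ARM64 change: `next` is captured before `OptimizeForLdpStp` runs, and that captured node is what gets returned. The likely reason (my inference; the commit does not spell it out) is that a successful reordering moves already-lowered nodes from between the two indirs to after `node`, so reading `node->gtNext` after the call would hand the lowering loop nodes it has already processed. A small self-contained sketch with hypothetical list types (not the JIT's LIR API) shows the effect:

```cpp
// Hypothetical doubly-linked-list model; Node and MoveAfter stand in for LIR
// nodes and LIR::Range::InsertAfter. Only the cursor-invalidation point matters.
#include <cassert>
#include <cstdio>

struct Node
{
    const char* name;
    Node*       prev = nullptr;
    Node*       next = nullptr;
};

// Unlink 'n' and reinsert it immediately after 'pos'.
static void MoveAfter(Node* n, Node* pos)
{
    n->prev->next = n->next;
    if (n->next != nullptr)
        n->next->prev = n->prev;

    n->next = pos->next;
    n->prev = pos;
    if (pos->next != nullptr)
        pos->next->prev = n;
    pos->next = n;
}

int main()
{
    // LIR order while lowering 'curStore':
    //   prevStore -> unrelated -> curStore -> follower
    Node p{"prevStore"}, u{"unrelated"}, s{"curStore"}, f{"follower"};
    p.next = &u; u.prev = &p;
    u.next = &s; s.prev = &u;
    s.next = &f; f.prev = &s;

    Node* next = s.next; // capture the successor first, as the diff does

    // The reordering moves 'unrelated' (already lowered) past 'curStore', so
    // the stores become adjacent: prevStore -> curStore -> unrelated -> follower.
    MoveAfter(&u, &s);

    assert(s.next == &u); // returning node->gtNext now would revisit 'unrelated'
    assert(next == &f);   // the captured successor still moves lowering forward
    printf("order: %s %s %s %s\n", p.name, s.name, u.name, f.name);
    return 0;
}
```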

src/coreclr/jit/lowerloongarch64.cpp

+3 −2

```diff
@@ -266,11 +266,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 //    node - The indirect store node (GT_STORE_IND) of interest
 //
 // Return Value:
-//    None.
+//    Next node to lower.
 //
-void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
+GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 {
     ContainCheckStoreIndir(node);
+    return node->gtNext;
 }
 
 //------------------------------------------------------------------------
```

src/coreclr/jit/lowerriscv64.cpp

+3 −2

```diff
@@ -215,11 +215,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 //    node - The indirect store node (GT_STORE_IND) of interest
 //
 // Return Value:
-//    None.
+//    Next node to lower.
 //
-void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
+GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 {
     ContainCheckStoreIndir(node);
+    return node->gtNext;
 }
 
 //------------------------------------------------------------------------
```

src/coreclr/jit/lowerxarch.cpp

+8 −6

```diff
@@ -73,9 +73,9 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 //    node - The indirect store node (GT_STORE_IND) of interest
 //
 // Return Value:
-//    None.
+//    Next node to lower.
 //
-void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
+GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 {
     // Mark all GT_STOREIND nodes to indicate that it is not known
     // whether it represents a RMW memory op.
@@ -92,7 +92,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
         // SSE2 doesn't support RMW form of instructions.
         if (LowerRMWMemOp(node))
         {
-            return;
+            return node->gtNext;
         }
     }
 
@@ -109,23 +109,25 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
     {
         if (!node->Data()->IsCnsVec())
         {
-            return;
+            return node->gtNext;
         }
 
         if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64))
         {
-            return;
+            return node->gtNext;
         }
 
         if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero())
         {
             // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors.
-            return;
+            return node->gtNext;
         }
 
         TryCompressConstVecData(node);
     }
 #endif
+
+    return node->gtNext;
 }
 
 //----------------------------------------------------------------------------------------------
```
