Skip to content

Commit c11f36e

Browse files
committed
8356708: C2: loop strip mining expansion doesn't take sunk stores into account
Reviewed-by: rcastanedalo, epeter
1 parent 8f121a1 commit c11f36e

File tree

3 files changed

+281
-27
lines changed

3 files changed

+281
-27
lines changed

src/hotspot/share/opto/loopnode.cpp

Lines changed: 140 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -298,19 +298,52 @@ IdealLoopTree* PhaseIdealLoop::insert_outer_loop(IdealLoopTree* loop, LoopNode*
298298
return outer_ilt;
299299
}
300300

301-
// Create a skeleton strip mined outer loop: a Loop head before the
302-
// inner strip mined loop, a safepoint and an exit condition guarded
303-
// by an opaque node after the inner strip mined loop with a backedge
304-
// to the loop head. The inner strip mined loop is left as it is. Only
305-
// once loop optimizations are over, do we adjust the inner loop exit
306-
// condition to limit its number of iterations, set the outer loop
307-
// exit condition and add Phis to the outer loop head. Some loop
308-
// optimizations that operate on the inner strip mined loop need to be
309-
// aware of the outer strip mined loop: loop unswitching needs to
310-
// clone the outer loop as well as the inner, unrolling needs to only
311-
// clone the inner loop etc. No optimizations need to change the outer
312-
// strip mined loop as it is only a skeleton.
313-
IdealLoopTree* PhaseIdealLoop::create_outer_strip_mined_loop(BoolNode *test, Node *cmp, Node *init_control,
301+
// Create a skeleton strip mined outer loop: an OuterStripMinedLoop head before the inner strip mined CountedLoop, a
302+
// SafePoint on exit of the inner CountedLoopEnd and an OuterStripMinedLoopEnd test that can't constant fold until loop
303+
// optimizations are over. The inner strip mined loop is left as it is. Only once loop optimizations are over, do we
304+
// adjust the inner loop exit condition to limit its number of iterations, set the outer loop exit condition and add
305+
// Phis to the outer loop head. Some loop optimizations that operate on the inner strip mined loop need to be aware of
306+
// the outer strip mined loop: loop unswitching needs to clone the outer loop as well as the inner, unrolling needs to
307+
// only clone the inner loop etc. No optimizations need to change the outer strip mined loop as it is only a skeleton.
308+
//
309+
// Schematically:
310+
//
311+
// OuterStripMinedLoop -------|
312+
// | |
313+
// CountedLoop ----------- | |
314+
// \- Phi (iv) -| | |
315+
// / \ | | |
316+
// CmpI AddI --| | |
317+
// \ | |
318+
// Bool | |
319+
// \ | |
320+
// CountedLoopEnd | |
321+
// / \ | |
322+
// IfFalse IfTrue--------| |
323+
// | |
324+
// SafePoint |
325+
// | |
326+
// OuterStripMinedLoopEnd |
327+
// / \ |
328+
// IfFalse IfTrue-----------|
329+
// |
330+
//
331+
//
332+
// As loop optimizations transform the inner loop, the outer strip mined loop stays mostly unchanged. The only exception
333+
// is nodes referenced from the SafePoint and sunk from the inner loop: they end up in the outer strip mined loop.
334+
//
335+
// Not adding Phis to the outer loop head from the beginning, and only adding them after loop optimizations does not
336+
// conform to C2's IR rules: any variable or memory slice that is mutated in a loop should have a Phi. The main
337+
// motivation for such a design that doesn't conform to C2's IR rules is to allow existing loop optimizations to be
338+
// mostly unaffected by the outer strip mined loop: the only extra step needed in most cases is to step over the
339+
// OuterStripMinedLoop. The main drawback is that once loop optimizations are over, an extra step is needed to finish
340+
// constructing the outer loop. This is handled by OuterStripMinedLoopNode::adjust_strip_mined_loop().
341+
//
342+
// Adding Phis to the outer loop is largely straightforward: there needs to be one Phi in the outer loop for every Phi
343+
// in the inner loop. Things may be more complicated for sunk Store nodes: there may not be any inner loop Phi left
344+
// after sinking for a particular memory slice but the outer loop needs a Phi. See
345+
// OuterStripMinedLoopNode::handle_sunk_stores_when_finishing_construction()
346+
IdealLoopTree* PhaseIdealLoop::create_outer_strip_mined_loop(Node* init_control,
314347
IdealLoopTree* loop, float cl_prob, float le_fcnt,
315348
Node*& entry_control, Node*& iffalse) {
316349
Node* outer_test = intcon(0);
@@ -2255,9 +2288,8 @@ bool PhaseIdealLoop::is_counted_loop(Node* x, IdealLoopTree*&loop, BasicType iv_
22552288
is_deleteable_safept(sfpt);
22562289
IdealLoopTree* outer_ilt = nullptr;
22572290
if (strip_mine_loop) {
2258-
outer_ilt = create_outer_strip_mined_loop(test, cmp, init_control, loop,
2259-
cl_prob, le->_fcnt, entry_control,
2260-
iffalse);
2291+
outer_ilt = create_outer_strip_mined_loop(init_control, loop, cl_prob, le->_fcnt,
2292+
entry_control, iffalse);
22612293
}
22622294

22632295
// Now setup a new CountedLoopNode to replace the existing LoopNode
@@ -2870,10 +2902,11 @@ BaseCountedLoopNode* BaseCountedLoopNode::make(Node* entry, Node* backedge, Basi
28702902
return new LongCountedLoopNode(entry, backedge);
28712903
}
28722904

2873-
void OuterStripMinedLoopNode::fix_sunk_stores(CountedLoopEndNode* inner_cle, LoopNode* inner_cl, PhaseIterGVN* igvn,
2874-
PhaseIdealLoop* iloop) {
2875-
Node* cle_out = inner_cle->proj_out(false);
2876-
Node* cle_tail = inner_cle->proj_out(true);
2905+
void OuterStripMinedLoopNode::fix_sunk_stores_when_back_to_counted_loop(PhaseIterGVN* igvn,
2906+
PhaseIdealLoop* iloop) const {
2907+
CountedLoopNode* inner_cl = inner_counted_loop();
2908+
IfFalseNode* cle_out = inner_loop_exit();
2909+
28772910
if (cle_out->outcnt() > 1) {
28782911
// Look for chains of stores that were sunk
28792912
// out of the inner loop and are in the outer loop
@@ -2988,11 +3021,90 @@ void OuterStripMinedLoopNode::fix_sunk_stores(CountedLoopEndNode* inner_cle, Loo
29883021
}
29893022
}
29903023

3024+
// The outer strip mined loop is initially only partially constructed. In particular Phis are omitted.
3025+
// See comment above: PhaseIdealLoop::create_outer_strip_mined_loop()
3026+
// We're now in the process of finishing the construction of the outer loop. For each Phi in the inner loop, a Phi in
3027+
// the outer loop was just now created. However, Sunk Stores cause an extra challenge:
3028+
// 1) If all Stores in the inner loop were sunk for a particular memory slice, there's no Phi left for that memory slice
3029+
// in the inner loop anymore, and hence we did not yet add a Phi for the outer loop. So an extra Phi must now be
3030+
// added for each chain of sunk Stores for a particular memory slice.
3031+
// 2) If some Stores were sunk and some left in the inner loop, a Phi was already created in the outer loop but
3032+
// its backedge input wasn't wired correctly to the last Store of the chain: the backedge input was set to the
3033+
// backedge of the inner loop Phi instead, but it needs to be the last Store of the chain in the outer loop. We now
3034+
// have to fix that too.
3035+
void OuterStripMinedLoopNode::handle_sunk_stores_when_finishing_construction(PhaseIterGVN* igvn) {
3036+
IfFalseNode* cle_exit_proj = inner_loop_exit();
3037+
3038+
// Find Sunk stores: Sunk stores are pinned on the loop exit projection of the inner loop. Indeed, because Sunk Stores
3039+
// modify the memory state captured by the SafePoint in the outer strip mined loop, they must be above it. The
3040+
// SafePoint's control input is the loop exit projection. It's also the only control out of the inner loop above the
3041+
// SafePoint.
3042+
#ifdef ASSERT
3043+
int stores_in_outer_loop_cnt = 0;
3044+
for (DUIterator_Fast imax, i = cle_exit_proj->fast_outs(imax); i < imax; i++) {
3045+
Node* u = cle_exit_proj->fast_out(i);
3046+
if (u->is_Store()) {
3047+
stores_in_outer_loop_cnt++;
3048+
}
3049+
}
3050+
#endif
3051+
3052+
// Sunk stores are reachable from the memory state of the outer loop safepoint
3053+
SafePointNode* safepoint = outer_safepoint();
3054+
MergeMemNode* mm = safepoint->in(TypeFunc::Memory)->isa_MergeMem();
3055+
if (mm == nullptr) {
3056+
// There is no MergeMem, which should only happen if there was no memory node
3057+
// sunk out of the loop.
3058+
assert(stores_in_outer_loop_cnt == 0, "inconsistent");
3059+
return;
3060+
}
3061+
DEBUG_ONLY(int stores_in_outer_loop_cnt2 = 0);
3062+
for (MergeMemStream mms(mm); mms.next_non_empty();) {
3063+
Node* mem = mms.memory();
3064+
// Traverse up the chain of stores to find the first store pinned
3065+
// at the loop exit projection.
3066+
Node* last = mem;
3067+
Node* first = nullptr;
3068+
while (mem->is_Store() && mem->in(0) == cle_exit_proj) {
3069+
DEBUG_ONLY(stores_in_outer_loop_cnt2++);
3070+
first = mem;
3071+
mem = mem->in(MemNode::Memory);
3072+
}
3073+
if (first != nullptr) {
3074+
// Found a chain of Stores that were sunk
3075+
// Do we already have a memory Phi for that slice on the outer loop? If that is the case, that Phi was created
3076+
// by cloning an inner loop Phi. The inner loop Phi should have mem, the memory state of the first Store out of
3077+
// the inner loop, as input on the backedge. So does the outer loop Phi given it's a clone.
3078+
Node* phi = nullptr;
3079+
for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
3080+
Node* u = mem->fast_out(i);
3081+
if (u->is_Phi() && u->in(0) == this && u->in(LoopBackControl) == mem) {
3082+
assert(phi == nullptr, "there should be only one");
3083+
phi = u;
3084+
PRODUCT_ONLY(break);
3085+
}
3086+
}
3087+
if (phi == nullptr) {
3088+
// No outer loop Phi? create one
3089+
phi = PhiNode::make(this, last);
3090+
phi->set_req(EntryControl, mem);
3091+
phi = igvn->transform(phi);
3092+
igvn->replace_input_of(first, MemNode::Memory, phi);
3093+
} else {
3094+
// Fix memory state along the backedge: it should be the last sunk Store of the chain
3095+
igvn->replace_input_of(phi, LoopBackControl, last);
3096+
}
3097+
}
3098+
}
3099+
assert(stores_in_outer_loop_cnt == stores_in_outer_loop_cnt2, "inconsistent");
3100+
}
3101+
29913102
void OuterStripMinedLoopNode::adjust_strip_mined_loop(PhaseIterGVN* igvn) {
3103+
verify_strip_mined(1);
29923104
// Look for the outer & inner strip mined loop, reduce number of
29933105
// iterations of the inner loop, set exit condition of outer loop,
29943106
// construct required phi nodes for outer loop.
2995-
CountedLoopNode* inner_cl = unique_ctrl_out()->as_CountedLoop();
3107+
CountedLoopNode* inner_cl = inner_counted_loop();
29963108
assert(inner_cl->is_strip_mined(), "inner loop should be strip mined");
29973109
if (LoopStripMiningIter == 0) {
29983110
remove_outer_loop_and_safepoint(igvn);
@@ -3010,7 +3122,7 @@ void OuterStripMinedLoopNode::adjust_strip_mined_loop(PhaseIterGVN* igvn) {
30103122
inner_cl->clear_strip_mined();
30113123
return;
30123124
}
3013-
CountedLoopEndNode* inner_cle = inner_cl->loopexit();
3125+
CountedLoopEndNode* inner_cle = inner_counted_loop_end();
30143126

30153127
int stride = inner_cl->stride_con();
30163128
// For a min int stride, LoopStripMiningIter * stride overflows the int range for all values of LoopStripMiningIter
@@ -3091,8 +3203,9 @@ void OuterStripMinedLoopNode::adjust_strip_mined_loop(PhaseIterGVN* igvn) {
30913203
}
30923204

30933205
Node* iv_phi = nullptr;
3094-
// Make a clone of each phi in the inner loop
3095-
// for the outer loop
3206+
// Make a clone of each phi in the inner loop for the outer loop
3207+
// When Stores were Sunk, after this step, a Phi may still be missing or its backedge incorrectly wired. See
3208+
// handle_sunk_stores_when_finishing_construction()
30963209
for (uint i = 0; i < inner_cl->outcnt(); i++) {
30973210
Node* u = inner_cl->raw_out(i);
30983211
if (u->is_Phi()) {
@@ -3111,6 +3224,8 @@ void OuterStripMinedLoopNode::adjust_strip_mined_loop(PhaseIterGVN* igvn) {
31113224
}
31123225
}
31133226

3227+
handle_sunk_stores_when_finishing_construction(igvn);
3228+
31143229
if (iv_phi != nullptr) {
31153230
// Now adjust the inner loop's exit condition
31163231
Node* limit = inner_cl->limit();
@@ -3166,7 +3281,7 @@ void OuterStripMinedLoopNode::transform_to_counted_loop(PhaseIterGVN* igvn, Phas
31663281
CountedLoopEndNode* inner_cle = inner_cl->loopexit();
31673282
Node* safepoint = outer_safepoint();
31683283

3169-
fix_sunk_stores(inner_cle, inner_cl, igvn, iloop);
3284+
fix_sunk_stores_when_back_to_counted_loop(igvn, iloop);
31703285

31713286
// make counted loop exit test always fail
31723287
ConINode* zero = igvn->intcon(0);

src/hotspot/share/opto/loopnode.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,8 @@ class LoopLimitNode : public Node {
573573
// Support for strip mining
574574
class OuterStripMinedLoopNode : public LoopNode {
575575
private:
576-
static void fix_sunk_stores(CountedLoopEndNode* inner_cle, LoopNode* inner_cl, PhaseIterGVN* igvn, PhaseIdealLoop* iloop);
576+
void fix_sunk_stores_when_back_to_counted_loop(PhaseIterGVN* igvn, PhaseIdealLoop* iloop) const;
577+
void handle_sunk_stores_when_finishing_construction(PhaseIterGVN* igvn);
577578

578579
public:
579580
OuterStripMinedLoopNode(Compile* C, Node *entry, Node *backedge)
@@ -589,6 +590,10 @@ class OuterStripMinedLoopNode : public LoopNode {
589590
virtual OuterStripMinedLoopEndNode* outer_loop_end() const;
590591
virtual IfFalseNode* outer_loop_exit() const;
591592
virtual SafePointNode* outer_safepoint() const;
593+
CountedLoopNode* inner_counted_loop() const { return unique_ctrl_out()->as_CountedLoop(); }
594+
CountedLoopEndNode* inner_counted_loop_end() const { return inner_counted_loop()->loopexit(); }
595+
IfFalseNode* inner_loop_exit() const { return inner_counted_loop_end()->proj_out(false)->as_IfFalse(); }
596+
592597
void adjust_strip_mined_loop(PhaseIterGVN* igvn);
593598

594599
void remove_outer_loop_and_safepoint(PhaseIterGVN* igvn) const;
@@ -1293,7 +1298,7 @@ class PhaseIdealLoop : public PhaseTransform {
12931298
void add_parse_predicate(Deoptimization::DeoptReason reason, Node* inner_head, IdealLoopTree* loop, SafePointNode* sfpt);
12941299
SafePointNode* find_safepoint(Node* back_control, Node* x, IdealLoopTree* loop);
12951300
IdealLoopTree* insert_outer_loop(IdealLoopTree* loop, LoopNode* outer_l, Node* outer_ift);
1296-
IdealLoopTree* create_outer_strip_mined_loop(BoolNode *test, Node *cmp, Node *init_control,
1301+
IdealLoopTree* create_outer_strip_mined_loop(Node* init_control,
12971302
IdealLoopTree* loop, float cl_prob, float le_fcnt,
12981303
Node*& entry_control, Node*& iffalse);
12991304

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* Copyright (c) 2025, Red Hat, Inc. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
/**
25+
* @test
26+
* @bug 8356708
27+
* @summary C2: loop strip mining expansion doesn't take sunk stores into account
28+
*
29+
* @run main/othervm -XX:-TieredCompilation -XX:-UseOnStackReplacement -XX:-BackgroundCompilation -XX:LoopMaxUnroll=0
30+
* -XX:+UnlockDiagnosticVMOptions -XX:+StressGCM -XX:StressSeed=26601954 TestStoresSunkInOuterStripMinedLoop
31+
* @run main/othervm -XX:-TieredCompilation -XX:-UseOnStackReplacement -XX:-BackgroundCompilation -XX:LoopMaxUnroll=0
32+
* -XX:+UnlockDiagnosticVMOptions -XX:+StressGCM TestStoresSunkInOuterStripMinedLoop
33+
* @run main TestStoresSunkInOuterStripMinedLoop
34+
*
35+
*/
36+
37+
public class TestStoresSunkInOuterStripMinedLoop {
38+
private static int field;
39+
private static volatile int volatileField;
40+
41+
public static void main(String[] args) {
42+
A a1 = new A();
43+
A a2 = new A();
44+
A a3 = new A();
45+
for (int i = 0; i < 20_000; i++) {
46+
field = 0;
47+
test1();
48+
if (field != 1500) {
49+
throw new RuntimeException(field + " != 1500");
50+
}
51+
a1.field = 0;
52+
test2(a1, a2);
53+
if (a1.field != 1500) {
54+
throw new RuntimeException(a1.field + " != 1500");
55+
}
56+
a1.field = 0;
57+
test3(a1, a2);
58+
if (a1.field != 1500) {
59+
throw new RuntimeException(a1.field + " != 1500");
60+
}
61+
a1.field = 0;
62+
test4(a1, a2, a3);
63+
if (a1.field != 1500) {
64+
throw new RuntimeException(a1.field + " != 1500");
65+
}
66+
}
67+
}
68+
69+
// Single store sunk in outer loop, no store in inner loop
70+
private static float test1() {
71+
int v = field;
72+
float f = 1;
73+
for (int i = 0; i < 1500; i++) {
74+
f *= 2;
75+
v++;
76+
field = v;
77+
}
78+
return f;
79+
}
80+
81+
// Multiple stores sunk in outer loop, no store in inner loop
82+
private static float test2(A a1, A a2) {
83+
field = a1.field + a2.field;
84+
volatileField = 42;
85+
int v = a1.field;
86+
float f = 1;
87+
for (int i = 0; i < 1500; i++) {
88+
f *= 2;
89+
v++;
90+
a1.field = v;
91+
a2.field = v;
92+
}
93+
return f;
94+
}
95+
96+
// Store sunk in outer loop, store in inner loop
97+
private static float test3(A a1, A a2) {
98+
field = a1.field + a2.field;
99+
volatileField = 42;
100+
int v = a1.field;
101+
float f = 1;
102+
A a = a2;
103+
for (int i = 0; i < 1500; i++) {
104+
f *= 2;
105+
v++;
106+
a.field = v;
107+
a = a1;
108+
a2.field = v;
109+
}
110+
return f;
111+
}
112+
113+
// Multiple stores sunk in outer loop, store in inner loop
114+
private static float test4(A a1, A a2, A a3) {
115+
field = a1.field + a2.field + a3.field;
116+
volatileField = 42;
117+
int v = a1.field;
118+
float f = 1;
119+
A a = a2;
120+
for (int i = 0; i < 1500; i++) {
121+
f *= 2;
122+
v++;
123+
a.field = v;
124+
a = a1;
125+
a2.field = v;
126+
a3.field = v;
127+
}
128+
return f;
129+
}
130+
131+
static class A {
132+
int field;
133+
}
134+
}

0 commit comments

Comments
 (0)