Skip to content

Commit e82031e

Browse files
committed
8350756: C2 SuperWord Multiversioning: remove useless slow loop when the fast loop disappears
Reviewed-by: kvn, chagedorn
1 parent 3626ac3 commit e82031e

File tree

7 files changed

+234
-3
lines changed

7 files changed

+234
-3
lines changed

src/hotspot/share/opto/loopnode.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "opto/movenode.hpp"
4242
#include "opto/mulnode.hpp"
4343
#include "opto/opaquenode.hpp"
44+
#include "opto/opcodes.hpp"
4445
#include "opto/predicates.hpp"
4546
#include "opto/rootnode.hpp"
4647
#include "opto/runtime.hpp"
@@ -2731,6 +2732,20 @@ Node* CountedLoopNode::match_incr_with_optional_truncation(Node* expr, Node** tr
27312732
return nullptr;
27322733
}
27332734

2735+
// Starting at this multiversion fast main loop, locate the multiversion_if, i.e. the IfNode
// whose condition input is an OpaqueMultiversioning node.
// The search goes via the pre loop: it takes the entry control of the pre loop, and asks
// Predicates for its entry().
// Returns the IfNode if found. Returns nullptr if the pre loop end cannot be found, or if the
// node at predicates.entry() is not an IfTrue projection of an If with an
// OpaqueMultiversioning condition.
// Must only be called on a main loop that is also a multiversion fast loop (asserted).
IfNode* CountedLoopNode::find_multiversion_if_from_multiversion_fast_main_loop() {
2736+
assert(is_main_loop() && is_multiversion_fast_loop(), "must be multiversion fast main loop");
2737+
CountedLoopEndNode* pre_end = find_pre_loop_end();
2738+
// Without a pre loop we have no way to continue the traversal.
if (pre_end == nullptr) { return nullptr; }
2739+
Node* pre_entry = pre_end->loopnode()->in(LoopNode::EntryControl);
2740+
const Predicates predicates(pre_entry);
2741+
// NOTE(review): presumably entry() is the control just above all predicates of the pre loop,
// as the local name suggests - confirm against the Predicates class.
IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue();
2742+
// in(0) of the projection is the If, and in(1) of that If is its condition.
if (before_predicates != nullptr &&
2743+
before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) {
2744+
return before_predicates->in(0)->as_If();
2745+
}
2746+
return nullptr;
2747+
}
2748+
27342749
LoopNode* CountedLoopNode::skip_strip_mined(int expect_skeleton) {
27352750
if (is_strip_mined() && in(EntryControl) != nullptr && in(EntryControl)->is_OuterStripMinedLoop()) {
27362751
verify_strip_mined(expect_skeleton);
@@ -4536,6 +4551,49 @@ void PhaseIdealLoop::eliminate_useless_zero_trip_guard() {
45364551
}
45374552
}
45384553

4554+
// Mark every OpaqueMultiversioning node as useless that was recorded in
// _multiversion_opaque_nodes, but can no longer be reached from any multiversion fast main loop,
// and that is still in delayed-slow-loop mode. Marked nodes are pushed to the IGVN worklist.
// Does nothing if no OpaqueMultiversioning node was recorded.
void PhaseIdealLoop::eliminate_useless_multiversion_if() {
4555+
if (_multiversion_opaque_nodes.size() == 0) {
4556+
return;
4557+
}
4558+
4559+
ResourceMark rm;
4560+
Unique_Node_List useful_multiversioning_opaque_nodes;
4561+
4562+
// The OpaqueMultiversioning is only used from the fast main loop in AutoVectorization, to add
4563+
// speculative runtime-checks to the multiversion_if. Thus, an OpaqueMultiversioning is only
4564+
// useful if it can be found from a fast main loop. If it can not be found from a fast main loop,
4565+
// then we cannot ever use that multiversion_if to add more speculative runtime-checks, and hence
4566+
// it is useless. If it is still in delayed mode, i.e. has not yet had any runtime-checks added,
4567+
// then we can let it constant fold towards the fast loop.
4568+
// Step 1: collect the useful opaque nodes, by visiting all counted loops in the loop tree
// that have no child loop.
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
4569+
IdealLoopTree* lpt = iter.current();
4570+
if (lpt->_child == nullptr && lpt->is_counted()) {
4571+
CountedLoopNode* head = lpt->_head->as_CountedLoop();
4572+
if (head->is_main_loop() && head->is_multiversion_fast_loop()) {
4573+
// There are fast_loop pre/main/post loops, but the finding traversal starts at the main
4574+
// loop, and traverses via the fast pre loop to the multiversion_if.
4575+
IfNode* multiversion_if = head->find_multiversion_if_from_multiversion_fast_main_loop();
4576+
if (multiversion_if != nullptr) {
4577+
useful_multiversioning_opaque_nodes.push(multiversion_if->in(1)->as_OpaqueMultiversioning());
4578+
}
4579+
}
4580+
}
4581+
}
4582+
4583+
// Step 2: every recorded opaque node that was not found above is useless. We only mark it if
// it is still delayed, which is also what mark_useless() asserts.
for (uint i = 0; i < _multiversion_opaque_nodes.size(); i++) {
4584+
OpaqueMultiversioningNode* opaque = _multiversion_opaque_nodes.at(i)->as_OpaqueMultiversioning();
4585+
if (!useful_multiversioning_opaque_nodes.member(opaque)) {
4586+
if (opaque->is_delayed_slow_loop()) {
4587+
// We cannot hack the node directly, otherwise the slow_loop will complain that it cannot
4588+
// find the multiversioning opaque node. Instead, we mark the opaque node as useless, and
4589+
// it can be constant folded during IGVN.
4590+
opaque->mark_useless();
4591+
_igvn._worklist.push(opaque);
4592+
}
4593+
}
4594+
}
4595+
}
4596+
45394597
//------------------------process_expensive_nodes-----------------------------
45404598
// Expensive nodes have their control input set to prevent the GVN
45414599
// from commoning them and as a result forcing the resulting node to
@@ -4805,6 +4863,7 @@ void PhaseIdealLoop::build_and_optimize() {
48054863
}
48064864

48074865
eliminate_useless_zero_trip_guard();
4866+
eliminate_useless_multiversion_if();
48084867

48094868
if (stop_early) {
48104869
assert(do_expensive_nodes, "why are we here?");
@@ -6596,6 +6655,9 @@ void PhaseIdealLoop::build_loop_late_post_work(Node *n, bool pinned) {
65966655
_zero_trip_guard_opaque_nodes.push(n);
65976656
}
65986657

6658+
if (!_verify_only && n->Opcode() == Op_OpaqueMultiversioning) {
6659+
_multiversion_opaque_nodes.push(n);
6660+
}
65996661
}
66006662

66016663
#ifdef ASSERT

src/hotspot/share/opto/loopnode.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ class CountedLoopNode : public BaseCountedLoopNode {
290290
bool has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
291291
void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
292292

293+
IfNode* find_multiversion_if_from_multiversion_fast_main_loop();
294+
293295
int main_idx() const { return _main_idx; }
294296

295297

@@ -932,6 +934,7 @@ class PhaseIdealLoop : public PhaseTransform {
932934
// clear out dead code after build_loop_late
933935
Node_List _deadlist;
934936
Node_List _zero_trip_guard_opaque_nodes;
937+
Node_List _multiversion_opaque_nodes;
935938

936939
// Support for faster execution of get_late_ctrl()/dom_lca()
937940
// when a node has many uses and dominator depth is deep.
@@ -1453,6 +1456,7 @@ class PhaseIdealLoop : public PhaseTransform {
14531456
void eliminate_useless_template_assertion_predicates(Unique_Node_List& useful_predicates);
14541457

14551458
void eliminate_useless_zero_trip_guard();
1459+
void eliminate_useless_multiversion_if();
14561460

14571461
public:
14581462
// Change the control input of expensive nodes to allow commoning by

src/hotspot/share/opto/loopopts.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,10 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
791791
// Ignore Template Assertion Predicates with OpaqueTemplateAssertionPredicate nodes.
792792
return nullptr;
793793
}
794-
assert(bol->Opcode() == Op_Bool, "Unexpected node");
794+
if (!bol->is_Bool()) {
795+
assert(false, "Expected Bool, but got %s", NodeClassNames[bol->Opcode()]);
796+
return nullptr;
797+
}
795798
int cmp_op = bol->in(1)->Opcode();
796799
if (cmp_op == Op_SubTypeCheck) { // SubTypeCheck expansion expects an IfNode
797800
return nullptr;

src/hotspot/share/opto/opaquenode.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,26 @@ IfNode* OpaqueZeroTripGuardNode::if_node() const {
8282
return iff->as_If();
8383
}
8484

85+
// If this node was marked useless, return its input in(1) instead of this node.
// Otherwise, defer to Opaque1Node::Identity.
Node* OpaqueMultiversioningNode::Identity(PhaseGVN* phase) {
86+
// Constant fold the multiversion_if. Since the slow_loop is still delayed,
87+
// i.e. we have not yet added any possibly failing condition, we can just
88+
// take the true branch in all cases.
89+
if (_useless) {
90+
// mark_useless() only allows marking while still delayed, and resuming the slow loop
// is not allowed once marked, so both flags must be set here.
assert(_is_delayed_slow_loop, "the slow_loop should still be delayed");
91+
return in(1);
92+
}
93+
return Opaque1Node::Identity(phase);
94+
}
95+
96+
#ifndef PRODUCT
97+
// Print the Opaque1Node spec to st, and append " #useless" if this node was marked useless.
// Only compiled in non-PRODUCT builds (enclosed in #ifndef PRODUCT).
void OpaqueMultiversioningNode::dump_spec(outputStream *st) const {
98+
Opaque1Node::dump_spec(st);
99+
if (_useless) {
100+
st->print(" #useless");
101+
}
102+
}
103+
#endif
104+
85105
const Type* OpaqueNotNullNode::Value(PhaseGVN* phase) const {
86106
return phase->type(in(1));
87107
}

src/hotspot/share/opto/opaquenode.hpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,30 @@ class OpaqueZeroTripGuardNode : public Opaque1Node {
101101
// Opaque node with two flags:
//  - _is_delayed_slow_loop: starts out true, and is cleared by
//    notify_slow_loop_that_it_can_resume_optimizations().
//  - _useless: starts out false, and is set by mark_useless(). Once set, Identity() returns
//    the input in(1) instead of this node.
// The asserts below enforce that a node can only be marked useless while still delayed, and
// that a useless node is never taken out of the delayed state.
class OpaqueMultiversioningNode : public Opaque1Node {
102102
private:
103103
bool _is_delayed_slow_loop;
104+
bool _useless;
104105

105106
public:
106107
OpaqueMultiversioningNode(Compile* C, Node* n) :
107-
Opaque1Node(C, n), _is_delayed_slow_loop(true)
108+
Opaque1Node(C, n), _is_delayed_slow_loop(true), _useless(false)
108109
{
109110
init_class_id(Class_OpaqueMultiversioning);
110111
}
111112
virtual int Opcode() const;
112113
virtual const Type* bottom_type() const { return TypeInt::BOOL; }
113114
bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
114-
void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
115+
116+
// Leave the delayed state. Not allowed once the node was marked useless.
void notify_slow_loop_that_it_can_resume_optimizations() {
117+
assert(!_useless, "must still be useful");
118+
_is_delayed_slow_loop = false;
119+
}
120+
121+
// Mark this node as useless. Only allowed while the slow loop is still delayed.
void mark_useless() {
122+
assert(_is_delayed_slow_loop, "must still be delayed");
123+
_useless = true;
124+
}
125+
126+
virtual Node* Identity(PhaseGVN* phase);
127+
NOT_PRODUCT(virtual void dump_spec(outputStream* st) const;)
115128
};
116129

117130
// This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though

test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,11 @@ public class IRNode {
329329
superWordNodes(ADD_REDUCTION_VL, "AddReductionVL");
330330
}
331331

332+
// IR node placeholder for the "OpaqueMultiversioning" node name, for use in @IR rules.
// NOTE(review): presumably beforeMatchingNameRegex registers a regex that matches the node by
// name in the ideal graph before matching - confirm against its definition in this class.
public static final String OPAQUE_MULTIVERSIONING = PREFIX + "OPAQUE_MULTIVERSIONING" + POSTFIX;
333+
static {
334+
beforeMatchingNameRegex(OPAQUE_MULTIVERSIONING, "OpaqueMultiversioning");
335+
}
336+
332337
public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX;
333338
static {
334339
String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END;
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
package compiler.loopopts.superword;
25+
26+
import compiler.lib.ir_framework.*;
27+
28+
/*
29+
* @test
30+
* @bug 8350756
31+
* @summary Test case where the multiversion fast_loop disappears, and we should
32+
* constant fold the multiversion_if, to remove the slow_loop.
33+
* @library /test/lib /
34+
* @run driver compiler.loopopts.superword.TestMultiversionRemoveUselessSlowLoop
35+
*/
36+
37+
/**
 * IR test for multiversioning: once the multiversion fast main loop of a loop disappears,
 * the corresponding multiversion_if is expected to constant fold, which removes the slow_loop.
 * {@link #testIR} checks the IR at three compile phases, and {@link #testCrash} is a
 * regression test that has no IR rules.
 */
public class TestMultiversionRemoveUselessSlowLoop {
38+
39+
/**
 * Runs all tests of this class with the TestFramework, with the flags and warmup set below.
 */
public static void main(String[] args) {
40+
TestFramework framework = new TestFramework(TestMultiversionRemoveUselessSlowLoop.class);
41+
// No traps means we cannot use the predicates version for SuperWord / AutoVectorization,
42+
// and instead use multiversioning directly.
43+
framework.addFlags("-XX:-TieredCompilation", "-XX:PerMethodTrapLimit=0");
44+
framework.setDefaultWarmup(0); // simulates Xcomp
45+
framework.start();
46+
}
47+
48+
// Arrays for the short first loop of testIR.
public static final int SIZE = 20;
49+
public static final int[] a = new int[SIZE];
50+
public static final int[] b = new int[SIZE];
51+
// Arrays for the long second loop of testIR.
public static final int SIZE2 = 10_000;
52+
public static final int[] a2 = new int[SIZE2];
53+
public static final int[] b2 = new int[SIZE2];
54+
55+
/**
 * Two array-copy loops: a short one over SIZE elements, and a long one over SIZE2 elements.
 * The three IR rules check the number of multiversion loops and OpaqueMultiversioning nodes
 * at PHASEIDEALLOOP1, PHASEIDEALLOOP_ITERATIONS and PRINT_IDEAL.
 */
@Test
56+
@IR(counts = {"pre .* multiversion_fast", "= 2", // regular pre-main-post for both loops
57+
"main .* multiversion_fast", "= 2",
58+
"post .* multiversion_fast", "= 2",
59+
"multiversion_delayed_slow", "= 2", // both have the delayed slow_loop
60+
"multiversion", "= 8", // nothing unexpected
61+
IRNode.OPAQUE_MULTIVERSIONING, "= 2"}, // Both multiversion_if are still here
62+
applyIfPlatform = {"64-bit", "true"},
63+
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
64+
phase = CompilePhase.PHASEIDEALLOOP1)
65+
@IR(counts = {"pre .* multiversion_fast", "= 2",
66+
"main .* multiversion_fast", "= 1", // The first main loop is fully unrolled
67+
"post .* multiversion_fast", "= 3", // the second loop is vectorized, and has a vectorized post loop
68+
"multiversion_delayed_slow", "= 1", // As a consequence of the first main loop being removed, we constant fold the multiversion_if
69+
"multiversion", "= 7", // nothing unexpected
70+
IRNode.OPAQUE_MULTIVERSIONING, "= 1"}, // The multiversion_if of the first loop was constant folded, because the main loop disappeared.
71+
applyIfPlatform = {"64-bit", "true"},
72+
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
73+
phase = CompilePhase.PHASEIDEALLOOP_ITERATIONS)
74+
@IR(counts = {"pre .* multiversion_fast.*", ">= 1", // In some cases, the pre loop of the first loop also disappears because it only has a single iteration
75+
"pre .* multiversion_fast.*", "<= 2", // but in other cases the pre loop of the first loop remains.
76+
"main .* multiversion_fast", "= 1",
77+
"post .* multiversion_fast", "= 3",
78+
"multiversion_delayed_slow", "= 0", // The second loop's multiversion_if was also not used, so it is constant folded after loop opts.
79+
"multiversion", ">= 5", // nothing unexpected
80+
"multiversion", "<= 6", // nothing unexpected
81+
IRNode.OPAQUE_MULTIVERSIONING, "= 0"}, // After loop-opts, we also constant fold the multiversion_if of the second loop, as it is unused.
82+
applyIfPlatform = {"64-bit", "true"},
83+
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
84+
phase = CompilePhase.PRINT_IDEAL)
85+
public static void testIR() {
86+
// This loop is short, and the multiversion_fast main loop eventually is fully unrolled.
87+
for (int i = 0; i < SIZE; i++) {
88+
a[i] = b[i];
89+
}
90+
// We take this second loop with a larger limit so that loop opts keeps going once the loop
91+
// above is fully optimized. It also gives us a reference where the main loop of the
92+
// multiversion fast_loop does not disappear.
93+
for (int i = 0; i < SIZE2; i++) {
94+
a2[i] = b2[i];
95+
}
96+
}
97+
98+
// Fields read and written by testCrash.
static long instanceCount;
99+
static int iFld;
100+
static int iFld1;
101+
102+
// The inner loop is Multiversioned, then PreMainPost and Unroll.
103+
// Eventually, both the fast and slow loops (pre main and post) disappear,
104+
// and leave us with a simple if-diamond using the multiversion_if.
105+
//
106+
// Verification code in PhaseIdealLoop::conditional_move finds this diamond
107+
// and expects a Bool but gets an OpaqueMultiversioning instead.
108+
//
109+
// If we let the multiversion_if constant fold soon after the main fast loop
110+
// disappears, then this issue does not occur any more.
111+
@Test
112+
public static void testCrash() {
113+
boolean b2 = true;
114+
for (int i = 0; i < 1000; i++) {
115+
for (int i21 = 82; i21 > 9; --i21) {
116+
if (b2)
117+
break;
118+
iFld1 = iFld;
119+
b2 = true;
120+
}
121+
instanceCount = iFld1;
122+
}
123+
}
124+
}

0 commit comments

Comments
 (0)