Skip to content

Commit 5d8d6da

Browse files
author
Sandhya Viswanathan
committed
8286972: Support the new loop induction variable related PopulateIndex IR node on x86
Reviewed-by: kvn, jbhateja
1 parent 8122466 commit 5d8d6da

File tree

4 files changed

+249
-22
lines changed

4 files changed

+249
-22
lines changed

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2274,6 +2274,84 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegist
22742274
}
22752275
}
22762276

2277+
// Emits one packed vector add of src1 and src2 into dst, choosing the instruction
// from elem_bt: vpaddb/vpaddw/vpaddd/vpaddq for byte/short/int/long and
// vaddps/vaddpd for float/double. vlen_enc is forwarded unchanged to that instruction.
// Any other element type trips the assert in the default case.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2278+
assert(UseAVX >= 2, "required");
2279+
#ifdef ASSERT
2280+
// Debug-only checks for byte/short elements when AVX512BW is not reported:
// the vector length must not be 512-bit and every register encoding must be below 16.
bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2281+
bool is_bw_supported = VM_Version::supports_avx512bw();
2282+
if (is_bw && !is_bw_supported) {
2283+
assert(vlen_enc != Assembler::AVX_512bit, "required");
2284+
assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2285+
"XMM register should be 0-15");
2286+
}
2287+
#endif // ASSERT
2288+
// Each handled case emits exactly one instruction and returns immediately.
switch (elem_bt) {
2289+
case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2290+
case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2291+
case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2292+
case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2293+
case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2294+
case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2295+
default: assert(false, "%s", type2name(elem_bt));
2296+
}
2297+
}
2298+
2299+
#ifdef _LP64
2300+
// Broadcasts the general-purpose register src into the vector register dst, with the
// element width taken from elem_bt and the vector length from vlen_enc.
// Two code shapes are emitted:
//  - an evpbroadcast{b,w,d,q} directly from the general-purpose register, when UseAVX > 2
//    and the AVX512BW / AVX512VL checks below pass;
//  - otherwise a movdl/movdq of src into dst followed by a broadcast from dst to itself.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2301+
assert(UseAVX >= 2, "required");
2302+
// is_bw: byte/short elements, which are gated on supports_avx512bw() below.
bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2303+
// is_vl: any vector length other than 512-bit, gated on supports_avx512vl() below.
bool is_vl = vlen_enc != Assembler::AVX_512bit;
2304+
if ((UseAVX > 2) &&
2305+
(!is_bw || VM_Version::supports_avx512bw()) &&
2306+
(!is_vl || VM_Version::supports_avx512vl())) {
2307+
// Float shares the 32-bit broadcast with int, and double shares the 64-bit one with long.
switch (elem_bt) {
2308+
case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2309+
case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2310+
case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2311+
case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2312+
default: assert(false, "%s", type2name(elem_bt));
2313+
}
2314+
} else {
2315+
// Fallback path: asserts a non-512-bit vector length and a dst encoding below 16.
assert(vlen_enc != Assembler::AVX_512bit, "required");
2316+
assert((dst->encoding() < 16),"XMM register should be 0-15");
2317+
// First move the scalar into dst (movdl for 8/16/32-bit types, movdq for 64-bit types),
// then broadcast dst across itself.
switch (elem_bt) {
2318+
case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2319+
case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2320+
case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2321+
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2322+
case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2323+
case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2324+
default: assert(false, "%s", type2name(elem_bt));
2325+
}
2326+
}
2327+
}
2328+
#endif
2329+
2330+
// Converts a vector of byte elements in src into a vector of to_elem_bt elements in dst.
// Integer targets use a single vpmovsxb{w,d,q}; float and double targets first widen
// to int with vpmovsxbd and then convert with vcvtdq2ps / vcvtdq2pd.
// Target types other than short/int/float/long/double trip the assert in the default case.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2331+
switch (to_elem_bt) {
2332+
case T_SHORT:
2333+
vpmovsxbw(dst, src, vlen_enc);
2334+
break;
2335+
case T_INT:
2336+
vpmovsxbd(dst, src, vlen_enc);
2337+
break;
2338+
case T_FLOAT:
2339+
// Two steps: byte -> int, then int -> float in place on dst.
vpmovsxbd(dst, src, vlen_enc);
2340+
vcvtdq2ps(dst, dst, vlen_enc);
2341+
break;
2342+
case T_LONG:
2343+
vpmovsxbq(dst, src, vlen_enc);
2344+
break;
2345+
case T_DOUBLE: {
2346+
// The intermediate int vector uses a narrower length encoding than the final
// double vector: 256-bit when the result is 512-bit, otherwise 128-bit.
int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2347+
vpmovsxbd(dst, src, mid_vlen_enc);
2348+
vcvtdq2pd(dst, dst, vlen_enc);
2349+
break;
2350+
}
2351+
default: assert(false, "%s", type2name(to_elem_bt));
2352+
}
2353+
}
2354+
22772355
//-------------------------------------------------------------------------------------------
22782356

22792357
// IndexOf for constant substrings with size >= 8 chars

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,13 @@
132132
void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
133133
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);
134134

135+
// Convert B2X
136+
void vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc);
137+
#ifdef _LP64
138+
void vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc);
139+
#endif
140+
void vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);
141+
135142
// blend
136143
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
137144
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);

src/hotspot/cpu/x86/x86.ad

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,6 +1468,11 @@ const bool Matcher::match_rule_supported(int opcode) {
14681468
return false;
14691469
}
14701470
break;
1471+
case Op_PopulateIndex:
1472+
if (!is_LP64 || (UseAVX < 2)) {
1473+
return false;
1474+
}
1475+
break;
14711476
case Op_RoundVF:
14721477
if (UseAVX < 2) { // enabled for AVX2 only
14731478
return false;
@@ -1811,6 +1816,10 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
18111816
return false; // Implementation limitation
18121817
}
18131818
break;
1819+
case Op_PopulateIndex:
1820+
if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
1821+
return false;
1822+
}
18141823
case Op_VectorCastB2X:
18151824
case Op_VectorCastS2X:
18161825
case Op_VectorCastI2X:
@@ -6918,28 +6927,7 @@ instruct vcastBtoX(vec dst, vec src) %{
69186927

69196928
BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
69206929
int vlen_enc = vector_length_encoding(this);
6921-
switch (to_elem_bt) {
6922-
case T_SHORT:
6923-
__ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6924-
break;
6925-
case T_INT:
6926-
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6927-
break;
6928-
case T_FLOAT:
6929-
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6930-
__ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6931-
break;
6932-
case T_LONG:
6933-
__ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6934-
break;
6935-
case T_DOUBLE: {
6936-
int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6937-
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6938-
__ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6939-
break;
6940-
}
6941-
default: assert(false, "%s", type2name(to_elem_bt));
6942-
}
6930+
__ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
69436931
%}
69446932
ins_pipe( pipe_slow );
69456933
%}
@@ -8272,6 +8260,45 @@ instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
82728260
ins_pipe( pipe_slow );
82738261
%}
82748262

8263+
#ifdef _LP64
8264+
// Matches PopulateIndex with an int start value (src1) and the constant stride 1 (src2).
// The encoding broadcasts src1 into vtmp, loads the iota index table into dst,
// widens it to the element type when that is not byte, and adds the two vectors.
instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp, rRegP scratch) %{
8264+
match(Set dst (PopulateIndex src1 src2));
8265+
effect(TEMP dst, TEMP vtmp, TEMP scratch);
8266+
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %}
8267+
ins_encode %{
8268+
// Only a stride of 1 is implemented; the immI_1 operand should already guarantee it.
assert($src2$$constant == 1, "required");
8269+
int vlen = Matcher::vector_length(this);
8270+
int vlen_enc = vector_length_encoding(this);
8271+
BasicType elem_bt = Matcher::vector_element_basic_type(this);
8272+
// vtmp = start value replicated into every lane.
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8273+
// NOTE(review): load_iota_indices is defined elsewhere; presumably it loads the
// byte sequence 0, 1, 2, ... into dst, using scratch for the address - confirm.
__ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen);
8274+
// The loaded indices are treated as bytes, so widen them unless the element type is byte.
if (elem_bt != T_BYTE) {
8275+
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8276+
}
8277+
// dst = indices + broadcast start value.
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8278+
%}
8279+
ins_pipe( pipe_slow );
8280+
%}
8282+
8283+
// Variant of the PopulateIndex rule whose start value (src1) is a long register (rRegL);
// the stride operand is still the int constant 1. The encoding body is the same sequence
// as the int rule: broadcast, load iota indices, widen, add.
instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp, rRegP scratch) %{
8284+
match(Set dst (PopulateIndex src1 src2));
8285+
effect(TEMP dst, TEMP vtmp, TEMP scratch);
8286+
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %}
8287+
ins_encode %{
8288+
// Only a stride of 1 is implemented; the immI_1 operand should already guarantee it.
assert($src2$$constant == 1, "required");
8289+
int vlen = Matcher::vector_length(this);
8290+
int vlen_enc = vector_length_encoding(this);
8291+
BasicType elem_bt = Matcher::vector_element_basic_type(this);
8292+
// vtmp = start value replicated into every lane.
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8293+
// NOTE(review): load_iota_indices is defined elsewhere; presumably it loads the
// byte sequence 0, 1, 2, ... into dst, using scratch for the address - confirm.
__ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen);
8294+
// The loaded indices are treated as bytes, so widen them unless the element type is byte.
if (elem_bt != T_BYTE) {
8295+
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8296+
}
8297+
// dst = indices + broadcast start value.
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8298+
%}
8299+
ins_pipe( pipe_slow );
8300+
%}
8301+
#endif
82758302
//-------------------------------- Rearrange ----------------------------------
82768303

82778304
// LoadShuffle/Rearrange for Byte
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
/**
25+
* @test
26+
* @bug 8286972
27+
* @summary Test vectorization of loop induction variable usage in the loop
28+
* @requires vm.compiler2.enabled
29+
* @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx2.*") |
30+
* (os.simpleArch == "aarch64" & vm.cpu.features ~= ".*sve.*")
31+
* @library /test/lib /
32+
* @run driver compiler.vectorization.TestPopulateIndex
33+
*/
34+
35+
package compiler.vectorization;
36+
import compiler.lib.ir_framework.*;
37+
import java.util.Random;
38+
39+
// IR-framework test with three loops that use the induction variable as data.
// Each @Test method carries an @IR rule requiring a count of "PopulateIndex" >= 1,
// runs its loop, and then calls a checker that recomputes the expected values
// and throws RuntimeException on the first mismatch.
public class TestPopulateIndex {
40+
// Length of every array and trip count of every loop in this test.
private static final int count = 10000;
41+
42+
// Written by indexArrayFill.
private int[] idx;
43+
// Random input for exprWithIndex1, filled once in the constructor.
private int[] src;
44+
// Written by exprWithIndex1.
private int[] dst;
45+
// Written by exprWithIndex2.
private float[] f;
46+
47+
// Entry point: hands this class to the IR test framework.
public static void main(String args[]) {
48+
TestFramework.run(TestPopulateIndex.class);
49+
}
50+
51+
// Allocates all arrays and fills src from a Random with fixed seed 0,
// so the input is the same on every run.
public TestPopulateIndex() {
52+
idx = new int[count];
53+
src = new int[count];
54+
dst = new int[count];
55+
f = new float[count];
56+
Random ran = new Random(0);
57+
for (int i = 0; i < count; i++) {
58+
src[i] = ran.nextInt();
59+
}
60+
}
61+
62+
// Stores the loop index itself into idx, then verifies the result.
@Test
63+
@IR(counts = {"PopulateIndex", ">= 1"})
64+
public void indexArrayFill() {
65+
for (int i = 0; i < count; i++) {
66+
idx[i] = i;
67+
}
68+
checkResultIndexArrayFill();
69+
}
70+
71+
// Checks idx[i] == i for every element.
public void checkResultIndexArrayFill() {
72+
for (int i = 0; i < count; i++) {
73+
int expected = i;
74+
if (idx[i] != expected) {
75+
throw new RuntimeException("Invalid result: idx[" + i + "] = " + idx[i] + " != " + expected);
76+
}
77+
}
78+
}
79+
80+
// Multiplies each src element by the low three bits of the loop index, then verifies.
@Test
81+
@IR(counts = {"PopulateIndex", ">= 1"})
82+
public void exprWithIndex1() {
83+
for (int i = 0; i < count; i++) {
84+
dst[i] = src[i] * (i & 7);
85+
}
86+
checkResultExprWithIndex1();
87+
}
88+
89+
// Recomputes src[i] * (i & 7) and compares it with dst[i].
public void checkResultExprWithIndex1() {
90+
for (int i = 0; i < count; i++) {
91+
int expected = src[i] * (i & 7);
92+
if (dst[i] != expected) {
93+
throw new RuntimeException("Invalid result: dst[" + i + "] = " + dst[i] + " != " + expected);
94+
}
95+
}
96+
}
97+
98+
// Stores i * i + 100 into the float array; the expression is evaluated in int
// arithmetic and converted to float on assignment. Then verifies.
@Test
99+
@IR(counts = {"PopulateIndex", ">= 1"})
100+
public void exprWithIndex2() {
101+
for (int i = 0; i < count; i++) {
102+
f[i] = i * i + 100;
103+
}
104+
checkResultExprWithIndex2();
105+
}
106+
107+
// Recomputes the same int expression, converts it to float, and compares it with f[i].
public void checkResultExprWithIndex2() {
108+
for (int i = 0; i < count; i++) {
109+
float expected = i * i + 100;
110+
if (f[i] != expected) {
111+
throw new RuntimeException("Invalid result: f[" + i + "] = " + f[i] + " != " + expected);
112+
}
113+
}
114+
}
115+
}

0 commit comments

Comments
 (0)