Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2274,6 +2274,84 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegist
}
}

void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
assert(UseAVX >= 2, "required");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not include this line in #ifdef ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not matter, since it is an assert, which adds code only in the debug VM. I like it this way.

#ifdef ASSERT
bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
bool is_bw_supported = VM_Version::supports_avx512bw();
if (is_bw && !is_bw_supported) {
assert(vlen_enc != Assembler::AVX_512bit, "required");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are acceptable values of vlen_enc?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For KNL, PopulateIndex support is limited to 256-bit as we need avx512bw() for the 512-bit support.
For other AVX2 and AVX512 architectures, all vector widths up to and including 512-bit are supported.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay.

assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
"XMM register should be 0-15");
}
Comment on lines +2280 to +2286
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole block could be under #ifdef ASSERT.

#endif // ASSERT
switch (elem_bt) {
case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
default: assert(false, "%s", type2name(elem_bt));
}
}

#ifdef _LP64
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
assert(UseAVX >= 2, "required");
bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
bool is_vl = vlen_enc != Assembler::AVX_512bit;
if ((UseAVX > 2) &&
(!is_bw || VM_Version::supports_avx512bw()) &&
(!is_vl || VM_Version::supports_avx512vl())) {
switch (elem_bt) {
case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
Comment on lines +2310 to +2311
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we use single- and double-precision broadcasts for the floating-point types, as you have done in the else part?
It may save the domain-switch penalty (Section 3.5.2.2, "Bypass between Execution Domains", Intel® 64 and IA-32 Architectures Optimization Reference Manual).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The floating point broadcast doesn't take the gpr as second source.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A prior move, as in the else part, could be emitted for consistency; or do you want to keep the floating-point broadcasts only in the else part?

default: assert(false, "%s", type2name(elem_bt));
}
} else {
assert(vlen_enc != Assembler::AVX_512bit, "required");
assert((dst->encoding() < 16),"XMM register should be 0-15");
Comment on lines +2314 to +2316
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The } else { case will also be executed on a KNL CPU. Did you test with -XX:+UseKNLSetting?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this part will be executed on KNL CPU.
I did run the compiler tests with UseKNLSetting and didn't see any issue.

switch (elem_bt) {
case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
default: assert(false, "%s", type2name(elem_bt));
}
}
}
#endif

// Emits the instruction sequence that converts a vector of byte elements in
// 'src' into a vector of 'to_elem_bt' elements in 'dst'.
//
//   to_elem_bt  target element type: T_SHORT, T_INT, T_FLOAT, T_LONG or T_DOUBLE
//   dst         destination vector register (also used as the intermediate)
//   src         source vector register holding the byte elements
//   vlen_enc    vector length encoding of the result
//
// Integral targets use a single sign-extending move (vpmovsxb*). Floating
// point targets first widen the bytes to ints and then convert the ints.
// Any other element type only trips the debug-build assert; no code is emitted.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (to_elem_bt == T_SHORT) {
    vpmovsxbw(dst, src, vlen_enc);
  } else if (to_elem_bt == T_INT) {
    vpmovsxbd(dst, src, vlen_enc);
  } else if (to_elem_bt == T_FLOAT) {
    // byte -> int, then int -> float, both at the full result width.
    vpmovsxbd(dst, src, vlen_enc);
    vcvtdq2ps(dst, dst, vlen_enc);
  } else if (to_elem_bt == T_LONG) {
    vpmovsxbq(dst, src, vlen_enc);
  } else if (to_elem_bt == T_DOUBLE) {
    // The intermediate int vector is produced at a narrower encoding than the
    // double result: 256-bit for a 512-bit result, 128-bit otherwise.
    int int_vlen_enc = Assembler::AVX_128bit;
    if (vlen_enc == Assembler::AVX_512bit) {
      int_vlen_enc = Assembler::AVX_256bit;
    }
    vpmovsxbd(dst, src, int_vlen_enc);
    vcvtdq2pd(dst, dst, vlen_enc);
  } else {
    assert(false, "%s", type2name(to_elem_bt));
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
Expand Down
7 changes: 7 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,13 @@
void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);

// Convert B2X
void vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc);
#ifdef _LP64
void vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc);
#endif
void vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);

// blend
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);
Expand Down
71 changes: 49 additions & 22 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1468,6 +1468,11 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_PopulateIndex:
if (!is_LP64 || (UseAVX < 2)) {
return false;
}
break;
case Op_RoundVF:
if (UseAVX < 2) { // enabled for AVX2 only
return false;
Expand Down Expand Up @@ -1811,6 +1816,10 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false; // Implementation limitation
}
break;
case Op_PopulateIndex:
if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
return false;
}
case Op_VectorCastB2X:
case Op_VectorCastS2X:
case Op_VectorCastI2X:
Expand Down Expand Up @@ -6918,28 +6927,7 @@ instruct vcastBtoX(vec dst, vec src) %{

BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
int vlen_enc = vector_length_encoding(this);
switch (to_elem_bt) {
case T_SHORT:
__ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
break;
case T_INT:
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
break;
case T_FLOAT:
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
break;
case T_LONG:
__ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
break;
case T_DOUBLE: {
int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
__ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
__ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
break;
}
default: assert(false, "%s", type2name(to_elem_bt));
}
__ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
Expand Down Expand Up @@ -8272,6 +8260,45 @@ instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp, rRegP scratch) %{
match(Set dst (PopulateIndex src1 src2));
effect(TEMP dst, TEMP vtmp, TEMP scratch);
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %}
ins_encode %{
assert($src2$$constant == 1, "required");
int vlen = Matcher::vector_length(this);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May I ask why use Matcher::vector_length() here, rather than Matcher::vector_length_in_bytes(), for load_iota_indices()? Thanks.

int vlen_enc = vector_length_encoding(this);
BasicType elem_bt = Matcher::vector_element_basic_type(this);
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
__ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen);
if (elem_bt != T_BYTE) {
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

// PopulateIndex whose start value src1 is an rRegL operand and whose second
// input is the constant 1 (immI_1).
// The encoding broadcasts src1 into vtmp, loads the iota indices into dst,
// widens them from byte to the vector's element type when needed, and finally
// adds vtmp to dst. dst, vtmp and scratch are all declared TEMP.
instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp, rRegP scratch) %{
match(Set dst (PopulateIndex src1 src2));
effect(TEMP dst, TEMP vtmp, TEMP scratch);
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %}
ins_encode %{
// Only the constant 1 is handled as second input; the immI_1 operand is
// expected to guarantee this already.
assert($src2$$constant == 1, "required");
// Vector length of this node; it is the size argument passed to load_iota_indices() below.
int vlen = Matcher::vector_length(this);
int vlen_enc = vector_length_encoding(this);
BasicType elem_bt = Matcher::vector_element_basic_type(this);
// vtmp = src1 replicated into every element.
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
// NOTE(review): presumably loads the byte sequence 0, 1, 2, ... into dst - confirm
// against load_iota_indices().
__ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen);
// vconvert_b2x has no T_BYTE case, so the conversion is skipped when the
// element type is already byte.
if (elem_bt != T_BYTE) {
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
// dst = converted indices + broadcast start value.
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
#endif
//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte
Expand Down
115 changes: 115 additions & 0 deletions test/hotspot/jtreg/compiler/vectorization/TestPopulateIndex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/**
* @test
* @bug 8286972
* @summary Test vectorization of loop induction variable usage in the loop
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR id missing.

* @requires vm.compiler2.enabled
* @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx2.*") |
* (os.simpleArch == "aarch64" & vm.cpu.features ~= ".*sve.*")
* @library /test/lib /
* @run driver compiler.vectorization.TestPopulateIndex
*/

package compiler.vectorization;
import compiler.lib.ir_framework.*;
import java.util.Random;

/**
 * Checks loops that use the loop induction variable as a value.
 * Each IR-checked method carries a rule requiring at least one "PopulateIndex"
 * match, and verifies its own output by calling the matching checkResult* method.
 */
public class TestPopulateIndex {
// Length of every array and trip count of every loop in this class.
private static final int count = 10000;

// Output of indexArrayFill().
private int[] idx;
// Pseudo-random input of exprWithIndex1().
private int[] src;
// Output of exprWithIndex1().
private int[] dst;
// Output of exprWithIndex2().
private float[] f;

// Entry point: hands this class to the IR test framework.
public static void main(String args[]) {
TestFramework.run(TestPopulateIndex.class);
}

// Allocates all arrays and fills src from a Random with fixed seed 0.
public TestPopulateIndex() {
idx = new int[count];
src = new int[count];
dst = new int[count];
f = new float[count];
Random ran = new Random(0);
for (int i = 0; i < count; i++) {
src[i] = ran.nextInt();
}
}

// Stores the induction variable itself: idx[i] = i.
@Test
@IR(counts = {"PopulateIndex", ">= 1"})
public void indexArrayFill() {
for (int i = 0; i < count; i++) {
idx[i] = i;
}
checkResultIndexArrayFill();
}

// Recomputes every expected idx[i] and throws a RuntimeException on the first mismatch.
public void checkResultIndexArrayFill() {
for (int i = 0; i < count; i++) {
int expected = i;
if (idx[i] != expected) {
throw new RuntimeException("Invalid result: idx[" + i + "] = " + idx[i] + " != " + expected);
}
}
}

// Uses the induction variable inside an int expression: dst[i] = src[i] * (i & 7).
@Test
@IR(counts = {"PopulateIndex", ">= 1"})
public void exprWithIndex1() {
for (int i = 0; i < count; i++) {
dst[i] = src[i] * (i & 7);
}
checkResultExprWithIndex1();
}

// Recomputes every expected dst[i] and throws a RuntimeException on the first mismatch.
public void checkResultExprWithIndex1() {
for (int i = 0; i < count; i++) {
int expected = src[i] * (i & 7);
if (dst[i] != expected) {
throw new RuntimeException("Invalid result: dst[" + i + "] = " + dst[i] + " != " + expected);
}
}
}

// Computes i * i + 100 in int arithmetic and stores the result into a float array.
@Test
@IR(counts = {"PopulateIndex", ">= 1"})
public void exprWithIndex2() {
for (int i = 0; i < count; i++) {
f[i] = i * i + 100;
}
checkResultExprWithIndex2();
}

// Recomputes every expected f[i] with the same expression, so the exact float
// comparison is against an identically computed value; throws on the first mismatch.
public void checkResultExprWithIndex2() {
for (int i = 0; i < count; i++) {
float expected = i * i + 100;
if (f[i] != expected) {
throw new RuntimeException("Invalid result: f[" + i + "] = " + f[i] + " != " + expected);
}
}
}
}