-
Notifications
You must be signed in to change notification settings - Fork 6.2k
8286972: Support the new loop induction variable related PopulateIndex IR node on x86 #8778
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8a1b7c5
a21939e
ab07fae
af5ede4
727daec
8c69c7f
1b3d0b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2274,6 +2274,84 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegist | |
| } | ||
| } | ||
|
|
||
| void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { /* Emits the vector add instruction that matches elem_bt; dst, src1, src2 and vlen_enc are handed unchanged to the selected instruction. */ | ||
| assert(UseAVX >= 2, "required"); /* this helper is only valid when AVX2 or newer is enabled */ | ||
| #ifdef ASSERT /* debug-only operand checks; is_bw and is_bw_supported are used solely by the asserts in this section */ | ||
| bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); /* byte or short lanes */ | ||
| bool is_bw_supported = VM_Version::supports_avx512bw(); | ||
| if (is_bw && !is_bw_supported) { /* byte/short add on a CPU without AVX512BW */ | ||
| assert(vlen_enc != Assembler::AVX_512bit, "required"); /* the 512-bit encoding must not be requested in that case */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are acceptable values of
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For KNL, PopulateIndex support is limited to 256-bit as we need avx512bw() for the 512-bit support.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. |
||
| assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), | ||
| "XMM register should be 0-15"); /* and all three operands must be registers numbered 0-15 */ | ||
| } | ||
|
Comment on lines
+2280
to
+2286
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole block could be under |
||
| #endif // ASSERT | ||
| switch (elem_bt) { /* every handled case emits exactly one instruction and returns */ | ||
| case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; | ||
| case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; | ||
| case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; | ||
| case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; /* float and double lanes use vaddps / vaddpd rather than the integer vpadd forms */ | ||
| case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; | ||
| case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; | ||
| default: assert(false, "%s", type2name(elem_bt)); /* any other element type is a caller error; the assert message is the type name */ | ||
| } | ||
| } | ||
|
|
||
| #ifdef _LP64 | ||
| void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { /* Broadcasts the scalar held in general-purpose register src into vector register dst, selecting the instruction sequence from elem_bt, vlen_enc and the available AVX-512 features. */ | ||
| assert(UseAVX >= 2, "required"); /* this helper is only valid when AVX2 or newer is enabled */ | ||
| bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); /* byte or short lanes */ | ||
| bool is_vl = vlen_enc != Assembler::AVX_512bit; /* true for every encoding other than 512-bit */ | ||
| if ((UseAVX > 2) && /* first path: broadcast directly from the general-purpose register; taken only when UseAVX > 2, */ | ||
| (!is_bw || VM_Version::supports_avx512bw()) && /* byte/short lanes additionally have AVX512BW, */ | ||
| (!is_vl || VM_Version::supports_avx512vl())) { /* and non-512-bit encodings additionally have AVX512VL */ | ||
| switch (elem_bt) { | ||
| case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; | ||
| case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; | ||
| case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; /* float shares the 32-bit broadcast with int */ | ||
| case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; /* double shares the 64-bit broadcast with long */ | ||
|
Comment on lines
+2310
to
+2311
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we use single and double precision broadcasts for floating point types, like you have done in else part
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The floating point broadcast doesn't take the gpr as second source.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A prior move as in else part may be emitted for consistency or you want to keep floating point broadcasts only for else part. |
||
| default: assert(false, "%s", type2name(elem_bt)); | ||
| } | ||
| } else { /* second path: move the scalar into dst first, then broadcast dst onto itself */ | ||
| assert(vlen_enc != Assembler::AVX_512bit, "required"); /* this path must never be asked for a 512-bit result */ | ||
| assert((dst->encoding() < 16),"XMM register should be 0-15"); /* and dst must be a register numbered 0-15 */ | ||
|
Comment on lines
+2314
to
+2316
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this part will be executed on KNL CPU. |
||
| switch (elem_bt) { /* movdl is used for byte, short, int and float; movdq for long and double */ | ||
| case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; | ||
| case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; | ||
| case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; | ||
| case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; /* unlike the first path, float and double get the floating-point broadcasts here */ | ||
| case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; | ||
| case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; | ||
| default: assert(false, "%s", type2name(elem_bt)); | ||
| } | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
| void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { /* Emits the instruction sequence that converts the byte lanes of src into lanes of to_elem_bt in dst; vlen_enc is the encoding of the result vector. */ | ||
| switch (to_elem_bt) { | ||
| case T_SHORT: | ||
| vpmovsxbw(dst, src, vlen_enc); /* byte to short in one instruction */ | ||
| break; | ||
| case T_INT: | ||
| vpmovsxbd(dst, src, vlen_enc); /* byte to int in one instruction */ | ||
| break; | ||
| case T_FLOAT: | ||
| vpmovsxbd(dst, src, vlen_enc); /* two steps: first byte to int, */ | ||
| vcvtdq2ps(dst, dst, vlen_enc); /* then int to float in place */ | ||
| break; | ||
| case T_LONG: | ||
| vpmovsxbq(dst, src, vlen_enc); /* byte to long in one instruction */ | ||
| break; | ||
| case T_DOUBLE: { | ||
| int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; /* the int intermediate uses a smaller encoding: 256-bit for a 512-bit result, 128-bit otherwise; presumably because int lanes are half as wide as double lanes (confirm) */ | ||
| vpmovsxbd(dst, src, mid_vlen_enc); /* byte to int at the reduced encoding, */ | ||
| vcvtdq2pd(dst, dst, vlen_enc); /* then int to double at the full encoding */ | ||
| break; | ||
| } | ||
| default: assert(false, "%s", type2name(to_elem_bt)); /* T_BYTE and every other type are not handled here; the assert message is the type name */ | ||
| } | ||
| } | ||
|
|
||
| //------------------------------------------------------------------------------------------- | ||
|
|
||
| // IndexOf for constant substrings with size >= 8 chars | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1468,6 +1468,11 @@ const bool Matcher::match_rule_supported(int opcode) { | |
| return false; | ||
| } | ||
| break; | ||
| case Op_PopulateIndex: | ||
| if (!is_LP64 || (UseAVX < 2)) { | ||
| return false; | ||
| } | ||
| break; | ||
| case Op_RoundVF: | ||
| if (UseAVX < 2) { // enabled for AVX2 only | ||
| return false; | ||
|
|
@@ -1811,6 +1816,10 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType | |
| return false; // Implementation limitation | ||
| } | ||
| break; | ||
| case Op_PopulateIndex: | ||
| if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) { /* vectors wider than 256 bits are rejected unless AVX512BW is available */ | ||
| return false; | ||
| } /* NOTE(review): this arm has no break, unlike the arm just before it, so when the check passes control continues into the Op_VectorCastB2X / Op_VectorCastS2X / Op_VectorCastI2X arm that follows; confirm the fall-through is intended, otherwise add a break */ | ||
| case Op_VectorCastB2X: | ||
| case Op_VectorCastS2X: | ||
| case Op_VectorCastI2X: | ||
|
|
@@ -6918,28 +6927,7 @@ instruct vcastBtoX(vec dst, vec src) %{ | |
|
|
||
| BasicType to_elem_bt = Matcher::vector_element_basic_type(this); | ||
| int vlen_enc = vector_length_encoding(this); | ||
| switch (to_elem_bt) { | ||
| case T_SHORT: | ||
| __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); | ||
| break; | ||
| case T_INT: | ||
| __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); | ||
| break; | ||
| case T_FLOAT: | ||
| __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); | ||
| __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); | ||
| break; | ||
| case T_LONG: | ||
| __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); | ||
| break; | ||
| case T_DOUBLE: { | ||
| int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; | ||
| __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc); | ||
| __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); | ||
| break; | ||
| } | ||
| default: assert(false, "%s", type2name(to_elem_bt)); | ||
| } | ||
| __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc); | ||
| %} | ||
| ins_pipe( pipe_slow ); | ||
| %} | ||
|
|
@@ -8272,6 +8260,45 @@ instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{ | |
| ins_pipe( pipe_slow ); | ||
| %} | ||
|
|
||
| #ifdef _LP64 | ||
| instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp, rRegP scratch) %{ | ||
| match(Set dst (PopulateIndex src1 src2)); | ||
| effect(TEMP dst, TEMP vtmp, TEMP scratch); | ||
| format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %} | ||
| ins_encode %{ | ||
| assert($src2$$constant == 1, "required"); /* Encoding for PopulateIndex with src1 in an int register: computes dst = (src1 in every lane) + (iota indices). Only a constant src2 of 1 is supported, which this assert re-checks. */ | ||
| int vlen = Matcher::vector_length(this); /* passed to load_iota_indices below */ | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May I ask why use |
||
| int vlen_enc = vector_length_encoding(this); /* passed to the broadcast, convert and add helpers */ | ||
| BasicType elem_bt = Matcher::vector_element_basic_type(this); | ||
| __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc); /* vtmp = src1 replicated into every lane */ | ||
| __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen); /* dst = iota indices, presumably 0, 1, 2, ... as byte lanes (confirm against load_iota_indices) */ | ||
| if (elem_bt != T_BYTE) { /* the indices are treated as bytes, so every other element type needs a conversion */ | ||
| __ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); | ||
| } | ||
| __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc); /* dst = indices + broadcast src1 */ | ||
| %} | ||
| ins_pipe( pipe_slow ); | ||
| %} | ||
|
|
||
| instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp, rRegP scratch) %{ | ||
| match(Set dst (PopulateIndex src1 src2)); | ||
| effect(TEMP dst, TEMP vtmp, TEMP scratch); | ||
| format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp and $scratch as TEMP" %} | ||
| ins_encode %{ | ||
| assert($src2$$constant == 1, "required"); /* Encoding for PopulateIndex with src1 in a long register (rRegL); the statements are the same as in VectorPopulateIndex. Only a constant src2 of 1 is supported, which this assert re-checks. */ | ||
| int vlen = Matcher::vector_length(this); /* passed to load_iota_indices below */ | ||
| int vlen_enc = vector_length_encoding(this); /* passed to the broadcast, convert and add helpers */ | ||
| BasicType elem_bt = Matcher::vector_element_basic_type(this); | ||
| __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc); /* vtmp = src1 replicated into every lane */ | ||
| __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen); /* dst = iota indices, presumably 0, 1, 2, ... as byte lanes (confirm against load_iota_indices) */ | ||
| if (elem_bt != T_BYTE) { /* the indices are treated as bytes, so every other element type needs a conversion */ | ||
| __ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); | ||
| } | ||
| __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc); /* dst = indices + broadcast src1 */ | ||
| %} | ||
| ins_pipe( pipe_slow ); | ||
| %} | ||
| #endif | ||
| //-------------------------------- Rearrange ---------------------------------- | ||
|
|
||
| // LoadShuffle/Rearrange for Byte | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| /* | ||
| * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. | ||
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | ||
| * | ||
| * This code is free software; you can redistribute it and/or modify it | ||
| * under the terms of the GNU General Public License version 2 only, as | ||
| * published by the Free Software Foundation. | ||
| * | ||
| * This code is distributed in the hope that it will be useful, but WITHOUT | ||
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
| * version 2 for more details (a copy is included in the LICENSE file that | ||
| * accompanied this code). | ||
| * | ||
| * You should have received a copy of the GNU General Public License version | ||
| * 2 along with this work; if not, write to the Free Software Foundation, | ||
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | ||
| * | ||
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | ||
| * or visit www.oracle.com if you need additional information or have any | ||
| * questions. | ||
| */ | ||
|
|
||
| /** | ||
| * @test | ||
| * @bug 8286972 | ||
| * @summary Test vectorization of loop induction variable usage in the loop | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. PR id missing. |
||
| * @requires vm.compiler2.enabled | ||
| * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx2.*") | | ||
| * (os.simpleArch == "aarch64" & vm.cpu.features ~= ".*sve.*") | ||
| * @library /test/lib / | ||
| * @run driver compiler.vectorization.TestPopulateIndex | ||
| */ | ||
|
|
||
| package compiler.vectorization; | ||
| import compiler.lib.ir_framework.*; | ||
| import java.util.Random; | ||
|
|
||
| public class TestPopulateIndex { /* IR-framework test: each @Test method runs a counted loop that uses the loop index as data, and its @IR rule expects a PopulateIndex node in the compiled code. */ | ||
| private static final int count = 10000; /* length of every array and trip count of every loop */ | ||
|
|
||
| private int[] idx; /* output of indexArrayFill */ | ||
| private int[] src; /* random input read by exprWithIndex1 */ | ||
| private int[] dst; /* output of exprWithIndex1 */ | ||
| private float[] f; /* output of exprWithIndex2 */ | ||
|
|
||
| public static void main(String args[]) { | ||
| TestFramework.run(TestPopulateIndex.class); /* hands this class to the IR test framework, which drives the @Test methods */ | ||
| } | ||
|
|
||
| public TestPopulateIndex() { | ||
| idx = new int[count]; | ||
| src = new int[count]; | ||
| dst = new int[count]; | ||
| f = new float[count]; | ||
| Random ran = new Random(0); /* fixed seed, so src holds the same values on every run */ | ||
| for (int i = 0; i < count; i++) { | ||
| src[i] = ran.nextInt(); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| @IR(counts = {"PopulateIndex", ">= 1"}) /* the compiled method must contain one or more PopulateIndex nodes */ | ||
| public void indexArrayFill() { /* simplest case: the loop index itself is the stored value */ | ||
| for (int i = 0; i < count; i++) { | ||
| idx[i] = i; | ||
| } | ||
| checkResultIndexArrayFill(); | ||
| } | ||
|
|
||
| public void checkResultIndexArrayFill() { /* throws on the first element of idx that differs from its own position */ | ||
| for (int i = 0; i < count; i++) { | ||
| int expected = i; | ||
| if (idx[i] != expected) { | ||
| throw new RuntimeException("Invalid result: idx[" + i + "] = " + idx[i] + " != " + expected); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| @IR(counts = {"PopulateIndex", ">= 1"}) /* the compiled method must contain one or more PopulateIndex nodes */ | ||
| public void exprWithIndex1() { /* the index is masked to its low three bits and multiplied with a loaded value */ | ||
| for (int i = 0; i < count; i++) { | ||
| dst[i] = src[i] * (i & 7); | ||
| } | ||
| checkResultExprWithIndex1(); | ||
| } | ||
|
|
||
| public void checkResultExprWithIndex1() { /* recomputes the same expression per element and throws on the first mismatch */ | ||
| for (int i = 0; i < count; i++) { | ||
| int expected = src[i] * (i & 7); | ||
| if (dst[i] != expected) { | ||
| throw new RuntimeException("Invalid result: dst[" + i + "] = " + dst[i] + " != " + expected); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| @IR(counts = {"PopulateIndex", ">= 1"}) /* the compiled method must contain one or more PopulateIndex nodes */ | ||
| public void exprWithIndex2() { /* int arithmetic on the index, converted to float when stored */ | ||
| for (int i = 0; i < count; i++) { | ||
| f[i] = i * i + 100; | ||
| } | ||
| checkResultExprWithIndex2(); | ||
| } | ||
|
|
||
| public void checkResultExprWithIndex2() { /* recomputes the same expression per element and throws on the first mismatch */ | ||
| for (int i = 0; i < count; i++) { | ||
| float expected = i * i + 100; /* evaluated in int and then converted to float, exactly as in the loop under test */ | ||
| if (f[i] != expected) { | ||
| throw new RuntimeException("Invalid result: f[" + i + "] = " + f[i] + " != " + expected); | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not include this line in the #ifdef?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does not matter, since it is an assert, which adds code only in a debug VM. I like it this way.