Skip to content

Commit 8e72d7c

Browse files
Scott Gibbons authored and Jatin Bhateja committed
8320448: Accelerate IndexOf using AVX2
Reviewed-by: epeter, kvn, sviswanathan
1 parent 25ad862 commit 8e72d7c

File tree

16 files changed

+3906
-30
lines changed

16 files changed

+3906
-30
lines changed

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 84 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -4491,13 +4491,21 @@ void C2_MacroAssembler::count_positives(Register ary1, Register len,
44914491
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
44924492
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
44934493
Register limit, Register result, Register chr,
4494-
XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4494+
XMMRegister vec1, XMMRegister vec2, bool is_char,
4495+
KRegister mask, bool expand_ary2) {
4496+
// for expand_ary2, limit is the (smaller) size of the second array.
44954497
ShortBranchVerifier sbv(this);
44964498
Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
44974499

4500+
assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4501+
"Expansion only implemented for AVX2");
4502+
44984503
int length_offset = arrayOopDesc::length_offset_in_bytes();
44994504
int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
45004505

4506+
Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4507+
int scaleIncr = expand_ary2 ? 8 : 16;
4508+
45014509
if (is_array_equ) {
45024510
// Check the input args
45034511
cmpoop(ary1, ary2);
@@ -4533,14 +4541,20 @@ void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register
45334541

45344542
if (UseAVX >= 2) {
45354543
// With AVX2, use 32-byte vector compare
4536-
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4544+
Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
45374545

45384546
// Compare 32-byte vectors
4539-
andl(result, 0x0000001f); // tail count (in bytes)
4540-
andl(limit, 0xffffffe0); // vector count (in bytes)
4541-
jcc(Assembler::zero, COMPARE_TAIL);
4547+
if (expand_ary2) {
4548+
andl(result, 0x0000000f); // tail count (in bytes)
4549+
andl(limit, 0xfffffff0); // vector count (in bytes)
4550+
jcc(Assembler::zero, COMPARE_TAIL);
4551+
} else {
4552+
andl(result, 0x0000001f); // tail count (in bytes)
4553+
andl(limit, 0xffffffe0); // vector count (in bytes)
4554+
jcc(Assembler::zero, COMPARE_TAIL_16);
4555+
}
45424556

4543-
lea(ary1, Address(ary1, limit, Address::times_1));
4557+
lea(ary1, Address(ary1, limit, scaleFactor));
45444558
lea(ary2, Address(ary2, limit, Address::times_1));
45454559
negptr(limit);
45464560

@@ -4583,25 +4597,59 @@ void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register
45834597
}//if (VM_Version::supports_avx512vlbw())
45844598
#endif //_LP64
45854599
bind(COMPARE_WIDE_VECTORS);
4586-
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4587-
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4600+
vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4601+
if (expand_ary2) {
4602+
vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4603+
} else {
4604+
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4605+
}
45884606
vpxor(vec1, vec2);
45894607

45904608
vptest(vec1, vec1);
45914609
jcc(Assembler::notZero, FALSE_LABEL);
4592-
addptr(limit, 32);
4610+
addptr(limit, scaleIncr * 2);
45934611
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
45944612

45954613
testl(result, result);
45964614
jcc(Assembler::zero, TRUE_LABEL);
45974615

4598-
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4599-
vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4616+
vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4617+
if (expand_ary2) {
4618+
vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4619+
} else {
4620+
vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4621+
}
46004622
vpxor(vec1, vec2);
46014623

46024624
vptest(vec1, vec1);
4603-
jccb(Assembler::notZero, FALSE_LABEL);
4604-
jmpb(TRUE_LABEL);
4625+
jcc(Assembler::notZero, FALSE_LABEL);
4626+
jmp(TRUE_LABEL);
4627+
4628+
bind(COMPARE_TAIL_16); // limit is zero
4629+
movl(limit, result);
4630+
4631+
// Compare 16-byte chunks
4632+
andl(result, 0x0000000f); // tail count (in bytes)
4633+
andl(limit, 0xfffffff0); // vector count (in bytes)
4634+
jcc(Assembler::zero, COMPARE_TAIL);
4635+
4636+
lea(ary1, Address(ary1, limit, scaleFactor));
4637+
lea(ary2, Address(ary2, limit, Address::times_1));
4638+
negptr(limit);
4639+
4640+
bind(COMPARE_WIDE_VECTORS_16);
4641+
movdqu(vec1, Address(ary1, limit, scaleFactor));
4642+
if (expand_ary2) {
4643+
vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4644+
} else {
4645+
movdqu(vec2, Address(ary2, limit, Address::times_1));
4646+
}
4647+
pxor(vec1, vec2);
4648+
4649+
ptest(vec1, vec1);
4650+
jcc(Assembler::notZero, FALSE_LABEL);
4651+
addptr(limit, scaleIncr);
4652+
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
46054653

46064654
bind(COMPARE_TAIL); // limit is zero
46074655
movl(limit, result);
@@ -4646,19 +4694,34 @@ void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register
46464694
}
46474695

46484696
// Compare 4-byte vectors
4649-
andl(limit, 0xfffffffc); // vector count (in bytes)
4650-
jccb(Assembler::zero, COMPARE_CHAR);
4697+
if (expand_ary2) {
4698+
testl(result, result);
4699+
jccb(Assembler::zero, TRUE_LABEL);
4700+
} else {
4701+
andl(limit, 0xfffffffc); // vector count (in bytes)
4702+
jccb(Assembler::zero, COMPARE_CHAR);
4703+
}
46514704

4652-
lea(ary1, Address(ary1, limit, Address::times_1));
4705+
lea(ary1, Address(ary1, limit, scaleFactor));
46534706
lea(ary2, Address(ary2, limit, Address::times_1));
46544707
negptr(limit);
46554708

46564709
bind(COMPARE_VECTORS);
4657-
movl(chr, Address(ary1, limit, Address::times_1));
4658-
cmpl(chr, Address(ary2, limit, Address::times_1));
4659-
jccb(Assembler::notEqual, FALSE_LABEL);
4660-
addptr(limit, 4);
4661-
jcc(Assembler::notZero, COMPARE_VECTORS);
4710+
if (expand_ary2) {
4711+
// There are no "vector" operations for bytes to shorts
4712+
movzbl(chr, Address(ary2, limit, Address::times_1));
4713+
cmpw(Address(ary1, limit, Address::times_2), chr);
4714+
jccb(Assembler::notEqual, FALSE_LABEL);
4715+
addptr(limit, 1);
4716+
jcc(Assembler::notZero, COMPARE_VECTORS);
4717+
jmp(TRUE_LABEL);
4718+
} else {
4719+
movl(chr, Address(ary1, limit, Address::times_1));
4720+
cmpl(chr, Address(ary2, limit, Address::times_1));
4721+
jccb(Assembler::notEqual, FALSE_LABEL);
4722+
addptr(limit, 4);
4723+
jcc(Assembler::notZero, COMPARE_VECTORS);
4724+
}
46624725

46634726
// Compare trailing char (final 2 bytes), if any
46644727
bind(COMPARE_CHAR);

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -289,10 +289,11 @@
289289
void count_positives(Register ary1, Register len,
290290
Register result, Register tmp1,
291291
XMMRegister vec1, XMMRegister vec2, KRegister mask1 = knoreg, KRegister mask2 = knoreg);
292+
292293
// Compare char[] or byte[] arrays.
293-
void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
294-
Register limit, Register result, Register chr,
295-
XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);
294+
void arrays_equals(bool is_array_equ, Register ary1, Register ary2, Register limit,
295+
Register result, Register chr, XMMRegister vec1, XMMRegister vec2,
296+
bool is_char, KRegister mask = knoreg, bool expand_ary2 = false);
296297

297298
void arrays_hashcode(Register str1, Register cnt1, Register result,
298299
Register tmp1, Register tmp2, Register tmp3, XMMRegister vnext,

0 commit comments

Comments
 (0)