Review comments resolutions.
Jatin Bhateja committed Jan 21, 2024
1 parent de47076 commit 9ed6b50
Showing 1 changed file with 45 additions and 42 deletions.
87 changes: 45 additions & 42 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -1584,50 +1584,49 @@ void C2_MacroAssembler::vpackI2X(BasicType elem_bt, XMMRegister dst,
void C2_MacroAssembler::vgather_subword_avx3(BasicType elem_bt, XMMRegister dst, Register base, XMMRegister offset,
                                             Register idx_base, int idx_off, XMMRegister idx_vec, XMMRegister ones,
                                             XMMRegister xtmp, KRegister gmask, int vlen_enc) {
  int mask_shift = -1;
  int bitlevel_offset_shift = -1;
  int normalized_index_shift = -1;
  Address::ScaleFactor scale = Address::no_scale;

  if (elem_bt == T_SHORT) {
    mask_shift = 31;
    scale = Address::times_2;
    bitlevel_offset_shift = 4;
    normalized_index_shift = 1;
  } else {
    assert(elem_bt == T_BYTE, "");
    mask_shift = 30;
    scale = Address::times_1;
    bitlevel_offset_shift = 3;
    normalized_index_shift = 2;
  }

  evmovdquq(idx_vec, Address(idx_base, idx_off, Address::times_4), vlen_enc);
  if (offset != xnoreg) {
    vpaddd(idx_vec, idx_vec, offset, vlen_enc);
  }
  // Normalize the indices to multiples of 2 (short) or 4 (byte), i.e. the
  // number of sub-words per double word, by masking off the low index bits.
  vpslld(xtmp, ones, normalized_index_shift, vlen_enc);
  vpand(xtmp, idx_vec, xtmp, vlen_enc);
  // Load double words from the normalized indices.
  evpgatherdd(dst, gmask, Address(base, xtmp, scale), vlen_enc);
  // Compute the bit-level offset of the actual sub-word within each double
  // word lane.
  vpsrld(xtmp, ones, mask_shift, vlen_enc);
  vpand(xtmp, idx_vec, xtmp, vlen_enc);
  vpslld(xtmp, xtmp, bitlevel_offset_shift, vlen_enc);
  // Move the sub-word value at the respective bit offset into the lower
  // 16 bits (short) / 8 bits (byte) of each double word lane.
  vpsrlvd(dst, dst, xtmp, vlen_enc);
  // Pack the double word vector into a sub-word vector.
  vpackI2X(elem_bt, dst, ones, xtmp, vlen_enc);
}
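For reference, here is a minimal scalar sketch of what each double word lane of the routine above computes, shown for the byte case on a little-endian target; the helper name and signature are illustrative, not part of the HotSpot sources:

#include <cstdint>
#include <cstring>

// Scalar model of one gather lane for T_BYTE, i.e. normalized_index_shift = 2,
// mask_shift = 30, bitlevel_offset_shift = 3.
static uint8_t gather_byte_lane(const uint8_t* base, uint32_t idx) {
  uint32_t ones = 0xFFFFFFFFu;
  uint32_t norm_idx = idx & (ones << 2);         // vpslld/vpand: round index down to a multiple of 4
  uint32_t dword;
  std::memcpy(&dword, base + norm_idx, 4);       // evpgatherdd: load the containing double word
  uint32_t bit_off = (idx & (ones >> 30)) << 3;  // vpsrld/vpand/vpslld: bit offset of the byte in its double word
  return (uint8_t)(dword >> bit_off);            // vpsrlvd + vpackI2X: shift down and keep the low 8 bits
}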

// Gather sub-words from the gather indices using integral gather instructions.
// Because of the lane size mismatch between int and sub-word types, the
// algorithm makes multiple calls to the leaf-level routine
// (vgather_subword_avx3), which performs the actual gather operation.
void C2_MacroAssembler::vgather_subword_masked_avx3(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
                                                    Register offset, XMMRegister offset_vec, XMMRegister idx_vec,
                                                    XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, KRegister mask,
@@ -1648,7 +1647,7 @@ void C2_MacroAssembler::vgather_subword_masked_avx3(BasicType elem_bt, XMMRegist
  // Loop to gather 8 (64-bit), 16 (128-bit), 32 (256-bit) or 64 (512-bit)
  // bytes from memory into a vector using integral gather instructions. The
  // number of loop iterations depends on the maximum integral vector size
  // supported by the target, capped by the gather count, e.g. in order to
  // gather 8 bytes on AVX-512 targets we need to use a 256-bit integer gather
  // even though the target supports 512-bit integral gather operations.
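  // For example (illustrative numbers), gathering 8 bytes takes 8 double word
  // indices, i.e. a 256-bit integer gather, so lane_count_subwords = 8,
  // lane_count_ints = 8 and the loop below runs exactly once.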
  for (int i = 0, j = 0; i < lane_count_subwords; i += lane_count_ints, j++) {
@@ -1753,12 +1752,14 @@ void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
@@ -1772,13 +1773,15 @@ void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i] + offset]
      movl(rtmp, Address(idx_base, i * 4));
      addl(rtmp, offset);
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i] + offset]
      movl(rtmp, Address(idx_base, i * 4));
      addl(rtmp, offset);
      pinsrb(dst, Address(base, rtmp), i);
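As a cross-check, a scalar sketch of the semantics the byte loop above implements; the helper name and signature are illustrative only:

#include <cstdint>

// Hypothetical scalar equivalent of vgather8b_offset for T_BYTE.
static void gather8b_offset_model(uint8_t* dst, const uint8_t* base,
                                  const int32_t* idx_base, int32_t offset) {
  for (int i = 0; i < 8; i++) {
    dst[i] = base[idx_base[i] + offset];  // movl + addl + pinsrb per element
  }
}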
