Skip to content

Commit 2de8d58

Browse files
erifan and Jatin Bhateja
authored and committed
8366333: AArch64: Enhance SVE subword type implementation of vector compress
Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Reviewed-by: jbhateja, xgong, galder, vlivanov
1 parent 0522cf2 commit 2de8d58

File tree

10 files changed

+410
-117
lines changed

10 files changed

+410
-117
lines changed

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7081,29 +7081,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
70817081
%}
70827082

70837083
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
7084-
vReg tmp3, vReg tmp4, pReg ptmp, pRegGov pgtmp) %{
7084+
vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
70857085
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
7086-
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ptmp, TEMP pgtmp);
7086+
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
70877087
match(Set dst (CompressV src pg));
7088-
format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, tmp4, $ptmp, $pgtmp" %}
7088+
format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
70897089
ins_encode %{
7090+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
70907091
__ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
7091-
$tmp1$$FloatRegister,$tmp2$$FloatRegister,
7092-
$tmp3$$FloatRegister,$tmp4$$FloatRegister,
7093-
$ptmp$$PRegister, $pgtmp$$PRegister);
7092+
$tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
7093+
$ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
70947094
%}
70957095
ins_pipe(pipe_slow);
70967096
%}
70977097

7098-
instruct vcompressS(vReg dst, vReg src, pReg pg,
7099-
vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
7098+
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
71007099
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
71017100
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
71027101
match(Set dst (CompressV src pg));
71037102
format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
71047103
ins_encode %{
7104+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
7105+
__ sve_dup($tmp1$$FloatRegister, __ H, 0);
71057106
__ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
7106-
$tmp1$$FloatRegister,$tmp2$$FloatRegister, $pgtmp$$PRegister);
7107+
$tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
7108+
length_in_bytes);
71077109
%}
71087110
ins_pipe(pipe_slow);
71097111
%}

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5069,29 +5069,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
50695069
%}
50705070

50715071
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
5072-
vReg tmp3, vReg tmp4, pReg ptmp, pRegGov pgtmp) %{
5072+
vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
50735073
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
5074-
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ptmp, TEMP pgtmp);
5074+
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
50755075
match(Set dst (CompressV src pg));
5076-
format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, tmp4, $ptmp, $pgtmp" %}
5076+
format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
50775077
ins_encode %{
5078+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
50785079
__ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
5079-
$tmp1$$FloatRegister,$tmp2$$FloatRegister,
5080-
$tmp3$$FloatRegister,$tmp4$$FloatRegister,
5081-
$ptmp$$PRegister, $pgtmp$$PRegister);
5080+
$tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
5081+
$ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
50825082
%}
50835083
ins_pipe(pipe_slow);
50845084
%}
50855085

5086-
instruct vcompressS(vReg dst, vReg src, pReg pg,
5087-
vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
5086+
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
50885087
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
50895088
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
50905089
match(Set dst (CompressV src pg));
50915090
format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
50925091
ins_encode %{
5092+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5093+
__ sve_dup($tmp1$$FloatRegister, __ H, 0);
50935094
__ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
5094-
$tmp1$$FloatRegister,$tmp2$$FloatRegister, $pgtmp$$PRegister);
5095+
$tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
5096+
length_in_bytes);
50955097
%}
50965098
ins_pipe(pipe_slow);
50975099
%}

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3486,6 +3486,7 @@ template<typename R, typename... Rx>
34863486
INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar
34873487
INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors
34883488
INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar
3489+
INSN(sve_splice,0b00000101, 0b101100100); // splice two vectors under predicate control, destructive
34893490
INSN(sve_sub, 0b00000100, 0b000001000); // vector sub
34903491
INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar
34913492
INSN(sve_umax, 0b00000100, 0b001001000); // unsigned maximum vectors

src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

Lines changed: 76 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l
22032203
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
22042204
// Any remaining elements of dst will be filled with zero.
22052205
// Clobbers: rscratch1
2206-
// Preserves: src, mask
2206+
// Preserves: mask, vzr
22072207
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2208-
FloatRegister vtmp1, FloatRegister vtmp2,
2209-
PRegister pgtmp) {
2208+
FloatRegister vzr, FloatRegister vtmp,
2209+
PRegister pgtmp, unsigned vector_length_in_bytes) {
22102210
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2211-
assert_different_registers(dst, src, vtmp1, vtmp2);
2211+
// When called by sve_compress_byte, src and vtmp may be the same register.
2212+
assert_different_registers(dst, src, vzr);
2213+
assert_different_registers(dst, vtmp, vzr);
22122214
assert_different_registers(mask, pgtmp);
2213-
2214-
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2215-
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
2216-
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2217-
sve_dup(vtmp2, H, 0);
2215+
// high <-- low
2216+
// Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2217+
// mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2218+
// Expected result: dst = 00 00 00 hh ee dd bb aa
22182219

22192220
// Extend lowest half to type INT.
2220-
// dst = 00004444 00003333 00002222 00001111
2221+
// dst = 00dd 00cc 00bb 00aa
22212222
sve_uunpklo(dst, S, src);
2222-
// pgtmp = 00000001 00000000 00000001 00000001
2223+
// pgtmp = 0001 0000 0001 0001
22232224
sve_punpklo(pgtmp, mask);
22242225
// Pack the active elements in size of type INT to the right,
22252226
// and fill the remaining elements with zero.
2226-
// dst = 00000000 00004444 00002222 00001111
2227+
// dst = 0000 00dd 00bb 00aa
22272228
sve_compact(dst, S, dst, pgtmp);
22282229
// Narrow the result back to type SHORT.
2229-
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
2230-
sve_uzp1(dst, H, dst, vtmp2);
2230+
// dst = 00 00 00 00 00 dd bb aa
2231+
sve_uzp1(dst, H, dst, vzr);
2232+
2233+
// Return if the vector length is no more than MaxVectorSize/2, since the
2234+
// highest half is invalid.
2235+
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2236+
return;
2237+
}
2238+
22312239
// Count the active elements of lowest half.
22322240
// rscratch1 = 3
22332241
sve_cntp(rscratch1, S, ptrue, pgtmp);
22342242

22352243
// Repeat to the highest half.
2236-
// pgtmp = 00000001 00000000 00000000 00000001
2244+
// pgtmp = 0001 0000 0000 0001
22372245
sve_punpkhi(pgtmp, mask);
2238-
// vtmp1 = 00008888 00007777 00006666 00005555
2239-
sve_uunpkhi(vtmp1, S, src);
2240-
// vtmp1 = 00000000 00000000 00008888 00005555
2241-
sve_compact(vtmp1, S, vtmp1, pgtmp);
2242-
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2243-
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2244-
2245-
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2246-
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2247-
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
2248-
// TRUE_CNT is the number of active elements in the compressed low.
2249-
neg(rscratch1, rscratch1);
2250-
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
2251-
sve_index(vtmp2, H, rscratch1, 1);
2252-
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2253-
sve_tbl(vtmp1, H, vtmp1, vtmp2);
2254-
2255-
// Combine the compressed high(after shifted) with the compressed low.
2256-
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
2257-
sve_orr(dst, dst, vtmp1);
2246+
// vtmp = 00hh 00gg 00ff 00ee
2247+
sve_uunpkhi(vtmp, S, src);
2248+
// vtmp = 0000 0000 00hh 00ee
2249+
sve_compact(vtmp, S, vtmp, pgtmp);
2250+
// vtmp = 00 00 00 00 00 00 hh ee
2251+
sve_uzp1(vtmp, H, vtmp, vzr);
2252+
2253+
// pgtmp = 00 00 00 00 00 01 01 01
2254+
sve_whilelt(pgtmp, H, zr, rscratch1);
2255+
// Compressed low: dst = 00 00 00 00 00 dd bb aa
2256+
// Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2257+
// Combine the compressed low with the compressed high:
2258+
// dst = 00 00 00 hh ee dd bb aa
2259+
sve_splice(dst, H, pgtmp, vtmp);
22582260
}
22592261

22602262
// Clobbers: rscratch1, rscratch2
22612263
// Preserves: src, mask
22622264
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2263-
FloatRegister vtmp1, FloatRegister vtmp2,
2264-
FloatRegister vtmp3, FloatRegister vtmp4,
2265-
PRegister ptmp, PRegister pgtmp) {
2265+
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2266+
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
22662267
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2267-
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2268+
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
22682269
assert_different_registers(mask, ptmp, pgtmp);
2269-
// Example input: src = 88 77 66 55 44 33 22 11
2270-
// mask = 01 00 00 01 01 00 01 01
2271-
// Expected result: dst = 00 00 00 88 55 44 22 11
2270+
// high <-- low
2271+
// Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2272+
// mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2273+
// Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2274+
FloatRegister vzr = vtmp3;
2275+
sve_dup(vzr, B, 0);
22722276

2273-
sve_dup(vtmp4, B, 0);
22742277
// Extend lowest half to type SHORT.
2275-
// vtmp1 = 0044 0033 0022 0011
2278+
// vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
22762279
sve_uunpklo(vtmp1, H, src);
2277-
// ptmp = 0001 0000 0001 0001
2280+
// ptmp = 00 01 00 00 00 01 00 01
22782281
sve_punpklo(ptmp, mask);
2279-
// Count the active elements of lowest half.
2280-
// rscratch2 = 3
2281-
sve_cntp(rscratch2, H, ptrue, ptmp);
22822282
// Pack the active elements in size of type SHORT to the right,
22832283
// and fill the remaining elements with zero.
2284-
// dst = 0000 0044 0022 0011
2285-
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2284+
// dst = 00 00 00 00 00 0g 0c 0a
2285+
unsigned extended_size = vector_length_in_bytes << 1;
2286+
sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
22862287
// Narrow the result back to type BYTE.
2287-
// dst = 00 00 00 00 00 44 22 11
2288-
sve_uzp1(dst, B, dst, vtmp4);
2288+
// dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2289+
sve_uzp1(dst, B, dst, vzr);
2290+
2291+
// Return if the vector length is no more than MaxVectorSize/2, since the
2292+
// highest half is invalid.
2293+
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2294+
return;
2295+
}
2296+
// Count the active elements of lowest half.
2297+
// rscratch2 = 3
2298+
sve_cntp(rscratch2, H, ptrue, ptmp);
22892299

22902300
// Repeat to the highest half.
2291-
// ptmp = 0001 0000 0000 0001
2301+
// ptmp = 00 01 00 00 00 00 00 01
22922302
sve_punpkhi(ptmp, mask);
2293-
// vtmp1 = 0088 0077 0066 0055
2303+
// vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
22942304
sve_uunpkhi(vtmp2, H, src);
2295-
// vtmp1 = 0000 0000 0088 0055
2296-
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2297-
2298-
sve_dup(vtmp4, B, 0);
2299-
// vtmp1 = 00 00 00 00 00 00 88 55
2300-
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2301-
2302-
// Compressed low: dst = 00 00 00 00 00 44 22 11
2303-
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2304-
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
2305-
// TRUE_CNT is the number of active elements in the compressed low.
2306-
neg(rscratch2, rscratch2);
2307-
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
2308-
sve_index(vtmp2, B, rscratch2, 1);
2309-
// vtmp1 = 00 00 00 88 55 00 00 00
2310-
sve_tbl(vtmp1, B, vtmp1, vtmp2);
2311-
// Combine the compressed high(after shifted) with the compressed low.
2312-
// dst = 00 00 00 88 55 44 22 11
2313-
sve_orr(dst, dst, vtmp1);
2305+
// vtmp1 = 00 00 00 00 00 00 0p 0i
2306+
sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2307+
// vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2308+
sve_uzp1(vtmp1, B, vtmp1, vzr);
2309+
2310+
// ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2311+
sve_whilelt(ptmp, B, zr, rscratch2);
2312+
// Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2313+
// Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2314+
// Combine the compressed low with the compressed high:
2315+
// dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2316+
sve_splice(dst, B, ptmp, vtmp1);
23142317
}
23152318

23162319
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {

src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,12 @@
173173
// lowest-numbered elements of dst. Any remaining elements of dst will
174174
// be filled with zero.
175175
void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
176-
FloatRegister vtmp1, FloatRegister vtmp2,
177-
FloatRegister vtmp3, FloatRegister vtmp4,
178-
PRegister ptmp, PRegister pgtmp);
176+
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
177+
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes);
179178

180179
void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
181-
FloatRegister vtmp1, FloatRegister vtmp2,
182-
PRegister pgtmp);
180+
FloatRegister vzr, FloatRegister vtmp,
181+
PRegister pgtmp, unsigned vector_length_in_bytes);
183182

184183
void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
185184

test/hotspot/gtest/aarch64/aarch64-asmtest.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2143,6 +2143,10 @@ def generate(kind, names):
21432143
["facge", "__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5);", "facge\tp1.h, p2/z, z4.h, z5.h"],
21442144
["facge", "__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5);", "facge\tp1.s, p2/z, z4.s, z5.s"],
21452145
["facge", "__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5);", "facge\tp1.d, p2/z, z4.d, z5.d"],
2146+
["splice", "__ sve_splice(z0, __ B, p0, z1);", "splice\tz0.b, p0, z0.b, z1.b"],
2147+
["splice", "__ sve_splice(z0, __ H, p0, z1);", "splice\tz0.h, p0, z0.h, z1.h"],
2148+
["splice", "__ sve_splice(z0, __ S, p0, z1);", "splice\tz0.s, p0, z0.s, z1.s"],
2149+
["splice", "__ sve_splice(z0, __ D, p0, z1);", "splice\tz0.d, p0, z0.d, z1.d"],
21462150
# SVE2 instructions
21472151
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
21482152
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],

0 commit comments

Comments
 (0)