Skip to content

Commit

Permalink
8305563: [vectorapi]: Initial aarch64 backend implementation for FP16…
Browse files Browse the repository at this point in the history
… operations

Reviewed-by: aph, xgong
  • Loading branch information
Bhavana Kilambi authored and Xiaohong Gong committed Jun 30, 2023
1 parent d0d24f3 commit 1317dcd
Show file tree
Hide file tree
Showing 12 changed files with 1,183 additions and 783 deletions.
252 changes: 248 additions & 4 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,20 @@ source %{
return false;
}
break;
case Op_AddVHF:
case Op_SubVHF:
case Op_MulVHF:
case Op_DivVHF:
case Op_AbsVHF:
case Op_NegVHF:
case Op_FmaVHF:
case Op_AddReductionVHF:
// FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
// Only the Neon instructions need this check. SVE supports 16-bit floats by default.
if (UseSVE > 0 || (VM_Version::supports_fphp() && VM_Version::supports_asimdhp())) {
break;
}
return false;
default:
break;
}
Expand Down Expand Up @@ -282,6 +296,7 @@ source %{
case Op_VectorMaskCmp:
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_AddReductionVHF:
case Op_AddReductionVF:
case Op_AddReductionVD:
case Op_AndReductionV:
Expand Down Expand Up @@ -572,6 +587,22 @@ instruct vaddL(vReg dst, vReg src1, vReg src2) %{
ins_pipe(pipe_slow);
%}

// Unpredicated half-precision (FP16) vector add: dst = src1 + src2.
// The encoding is chosen per vector length: the SVE form is emitted when
// VM_Version::use_neon_for_vector rejects the length, the NEON form otherwise.
instruct vaddHF(vReg dst, vReg src1, vReg src2) %{
  match(Set dst (AddVHF src1 src2));
  format %{ "vaddHF $dst, $src1, $src2" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
      // SVE encoding, element size H.
      assert(UseSVE > 0, "must be sve");
      __ sve_fadd($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      // NEON encoding; the arrangement is derived from this node.
      __ fadd($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vaddF(vReg dst, vReg src1, vReg src2) %{
match(Set dst (AddVF src1 src2));
format %{ "vaddF $dst, $src1, $src2" %}
Expand Down Expand Up @@ -646,6 +677,16 @@ instruct vaddL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

// Predicated half-precision (FP16) vector add, SVE only.
// dst_src1 is both the first addend and the destination; pg is passed as
// the predicate operand of the SVE instruction.
instruct vaddHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AddVHF (Binary dst_src1 src2) pg));
  format %{ "vaddHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    const FloatRegister acc_reg = $dst_src1$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fadd(acc_reg, __ H, mask_reg, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vaddF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src1 (AddVF (Binary dst_src1 src2) pg));
Expand Down Expand Up @@ -796,6 +837,22 @@ instruct vsubL(vReg dst, vReg src1, vReg src2) %{
ins_pipe(pipe_slow);
%}

// Unpredicated half-precision (FP16) vector subtract: dst = src1 - src2.
// The SVE form is emitted when VM_Version::use_neon_for_vector rejects the
// vector length, the NEON form otherwise.
instruct vsubHF(vReg dst, vReg src1, vReg src2) %{
  match(Set dst (SubVHF src1 src2));
  format %{ "vsubHF $dst, $src1, $src2" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
      // SVE encoding, element size H.
      assert(UseSVE > 0, "must be sve");
      __ sve_fsub($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      // NEON encoding; the arrangement is derived from this node.
      __ fsub($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vsubF(vReg dst, vReg src1, vReg src2) %{
match(Set dst (SubVF src1 src2));
format %{ "vsubF $dst, $src1, $src2" %}
Expand Down Expand Up @@ -870,6 +927,16 @@ instruct vsubL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

// Predicated half-precision (FP16) vector subtract, SVE only.
// dst_src1 is both the minuend and the destination; pg is passed as the
// predicate operand of the SVE instruction.
instruct vsubHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (SubVHF (Binary dst_src1 src2) pg));
  format %{ "vsubHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    const FloatRegister acc_reg = $dst_src1$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fsub(acc_reg, __ H, mask_reg, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vsubF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src1 (SubVF (Binary dst_src1 src2) pg));
Expand Down Expand Up @@ -993,6 +1060,22 @@ instruct vmulL_sve(vReg dst_src1, vReg src2) %{

// vector mul - floating-point

// Unpredicated half-precision (FP16) vector multiply: dst = src1 * src2.
// The SVE form is emitted when VM_Version::use_neon_for_vector rejects the
// vector length, the NEON form otherwise.
instruct vmulHF(vReg dst, vReg src1, vReg src2) %{
  match(Set dst (MulVHF src1 src2));
  format %{ "vmulHF $dst, $src1, $src2" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
      // SVE encoding, element size H.
      assert(UseSVE > 0, "must be sve");
      __ sve_fmul($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      // NEON encoding; the arrangement is derived from this node.
      __ fmul($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vmulF(vReg dst, vReg src1, vReg src2) %{
match(Set dst (MulVF src1 src2));
format %{ "vmulF $dst, $src1, $src2" %}
Expand Down Expand Up @@ -1067,6 +1150,16 @@ instruct vmulL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

// Predicated half-precision (FP16) vector multiply, SVE only.
// dst_src1 is both the first factor and the destination; pg is passed as
// the predicate operand of the SVE instruction.
instruct vmulHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (MulVHF (Binary dst_src1 src2) pg));
  format %{ "vmulHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    const FloatRegister acc_reg = $dst_src1$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fmul(acc_reg, __ H, mask_reg, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmulF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src1 (MulVF (Binary dst_src1 src2) pg));
Expand All @@ -1091,6 +1184,28 @@ instruct vmulD_masked(vReg dst_src1, vReg src2, pRegGov pg) %{

// vector float div

// Half-precision (FP16) vector divide, NEON encoding: dst = src1 / src2.
// Selected only when VM_Version::use_neon_for_vector accepts the length of
// the matched node.
instruct vdivHF_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst (DivVHF src1 src2));
  format %{ "vdivHF_neon $dst, $src1, $src2" %}
  ins_encode %{
    const FloatRegister dividend_reg = $src1$$FloatRegister;
    const FloatRegister divisor_reg = $src2$$FloatRegister;
    __ fdiv($dst$$FloatRegister, get_arrangement(this), dividend_reg, divisor_reg);
  %}
  ins_pipe(pipe_slow);
%}

// Half-precision (FP16) vector divide, SVE encoding.
// Selected when VM_Version::use_neon_for_vector rejects the length of the
// matched node. dst_src1 is both dividend and destination, and ptrue is
// passed in the predicate operand position.
instruct vdivHF_sve(vReg dst_src1, vReg src2) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 (DivVHF dst_src1 src2));
  format %{ "vdivHF_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    const FloatRegister quotient_reg = $dst_src1$$FloatRegister;
    __ sve_fdiv(quotient_reg, __ H, ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vdivF_neon(vReg dst, vReg src1, vReg src2) %{
predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
match(Set dst (DivVF src1 src2));
Expand Down Expand Up @@ -1137,6 +1252,16 @@ instruct vdivD_sve(vReg dst_src1, vReg src2) %{

// vector float div - predicated

// Predicated half-precision (FP16) vector divide, SVE only.
// dst_src1 is both the dividend and the destination; pg is passed as the
// predicate operand of the SVE instruction.
instruct vdivHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (DivVHF (Binary dst_src1 src2) pg));
  format %{ "vdivHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    const FloatRegister quotient_reg = $dst_src1$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fdiv(quotient_reg, __ H, mask_reg, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vdivF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src1 (DivVF (Binary dst_src1 src2) pg));
Expand Down Expand Up @@ -1605,6 +1730,21 @@ instruct vabsL(vReg dst, vReg src) %{
ins_pipe(pipe_slow);
%}

// Unpredicated half-precision (FP16) vector absolute value: dst = |src|.
// The SVE form (with ptrue in the predicate operand position) is emitted
// when VM_Version::use_neon_for_vector rejects the vector length, the NEON
// form otherwise.
instruct vabsHF(vReg dst, vReg src) %{
  match(Set dst (AbsVHF src));
  format %{ "vabsHF $dst, $src" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
      // SVE encoding, element size H.
      assert(UseSVE > 0, "must be sve");
      __ sve_fabs($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister);
    } else {
      // NEON encoding; the arrangement is derived from this node.
      __ fabs($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vabsF(vReg dst, vReg src) %{
match(Set dst (AbsVF src));
format %{ "vabsF $dst, $src" %}
Expand Down Expand Up @@ -1677,6 +1817,16 @@ instruct vabsL_masked(vReg dst_src, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

// Predicated half-precision (FP16) vector absolute value, SVE only.
// dst_src is used as both source and destination; pg is passed as the
// predicate operand of the SVE instruction.
instruct vabsHF_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (AbsVHF dst_src pg));
  format %{ "vabsHF_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    const FloatRegister vec_reg = $dst_src$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fabs(vec_reg, __ H, mask_reg, vec_reg);
  %}
  ins_pipe(pipe_slow);
%}

instruct vabsF_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (AbsVF dst_src pg));
Expand Down Expand Up @@ -1778,6 +1928,21 @@ instruct vnegL(vReg dst, vReg src) %{
ins_pipe(pipe_slow);
%}

// Unpredicated half-precision (FP16) vector negate: dst = -src.
// The SVE form (with ptrue in the predicate operand position) is emitted
// when VM_Version::use_neon_for_vector rejects the vector length, the NEON
// form otherwise.
instruct vnegHF(vReg dst, vReg src) %{
  match(Set dst (NegVHF src));
  format %{ "vnegHF $dst, $src" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
      // SVE encoding, element size H.
      assert(UseSVE > 0, "must be sve");
      __ sve_fneg($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister);
    } else {
      // NEON encoding; the arrangement is derived from this node.
      __ fneg($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vnegF(vReg dst, vReg src) %{
match(Set dst (NegVF src));
format %{ "vnegF $dst, $src" %}
Expand Down Expand Up @@ -1832,6 +1997,16 @@ instruct vnegL_masked(vReg dst_src, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

// Predicated half-precision (FP16) vector negate, SVE only.
// dst_src is used as both source and destination; pg is passed as the
// predicate operand of the SVE instruction.
instruct vnegHF_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (NegVHF dst_src pg));
  format %{ "vnegHF_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    const FloatRegister vec_reg = $dst_src$$FloatRegister;
    const PRegister mask_reg = $pg$$PRegister;
    __ sve_fneg(vec_reg, __ H, mask_reg, vec_reg);
  %}
  ins_pipe(pipe_slow);
%}

instruct vnegF_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (NegVF dst_src pg));
Expand Down Expand Up @@ -2139,8 +2314,9 @@ instruct vmla_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{

instruct vfmla(vReg dst_src1, vReg src2, vReg src3) %{
predicate(UseFMA);
match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3)));
match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3)));
match(Set dst_src1 (FmaVHF dst_src1 (Binary src2 src3)));
match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3)));
match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3)));
format %{ "vfmla $dst_src1, $src2, $src3" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
Expand All @@ -2162,8 +2338,9 @@ instruct vfmla(vReg dst_src1, vReg src2, vReg src3) %{

instruct vfmad_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
predicate(UseFMA && UseSVE > 0);
match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary src3 pg)));
match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary src3 pg)));
match(Set dst_src1 (FmaVHF (Binary dst_src1 src2) (Binary src3 pg)));
match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary src3 pg)));
match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary src3 pg)));
format %{ "vfmad_masked $dst_src1, $pg, $src2, $src3" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Expand Down Expand Up @@ -2890,6 +3067,45 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
ins_pipe(pipe_slow);
%}

// reduction addHF
// Add-reduction of FP16 lanes for the NEON-only configuration (UseSVE == 0).
// hfsrc carries the scalar input in a general register and dst receives the
// result in a general register; tmp1 and tmp2 are scratch FP/vector registers.
instruct reduce_addHF_neon(iRegINoSp dst, iRegIorL2I hfsrc, vReg vsrc, vReg tmp1, vRegF tmp2) %{
  predicate(UseSVE == 0);
  match(Set dst (AddReductionVHF hfsrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
  format %{ "reduce_addHF_neon $dst, $hfsrc, $vsrc\t# KILL $tmp1, $tmp2" %}
  ins_encode %{
    const uint vlen_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    assert(vlen_bytes == 8 || vlen_bytes == 16, "invalid vector length");

    const FloatRegister partial_reg = $tmp1$$FloatRegister;
    const FloatRegister scalar_reg = $tmp2$$FloatRegister;

    // Bring the scalar input over to the FP register file.
    __ fmovwh(scalar_reg, $hfsrc$$Register);

    // A vector that is not 8 bytes long gets one extra T8H fold first; after
    // that both lengths share the same T4H fold and the final scalar H step.
    FloatRegister fold_in = $vsrc$$FloatRegister;
    if (vlen_bytes != 8) {
      __ faddp(partial_reg, __ T8H, fold_in, fold_in);
      fold_in = partial_reg;
    }
    __ faddp(partial_reg, __ T4H, fold_in, fold_in);
    __ faddp(partial_reg, partial_reg, __ H);

    // Combine with the scalar input and hand the result back in dst.
    __ faddh(scalar_reg, scalar_reg, partial_reg);
    __ fmovhw($dst$$Register, scalar_reg);
  %}
  ins_pipe(pipe_slow);
%}

// Add-reduction of FP16 lanes when SVE is enabled (UseSVE > 0).
// hfsrc carries the scalar input in a general register and dst receives the
// result in a general register; tmp is a scratch FP register.
instruct reduce_addHF_sve(iRegINoSp dst, iRegIorL2I hfsrc, vReg vsrc, vRegF tmp) %{
  predicate(UseSVE > 0);
  match(Set dst (AddReductionVHF hfsrc vsrc));
  effect(TEMP tmp);
  format %{ "reduce_addHF_sve $dst, $hfsrc, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    // ptrue is used below, so this rule expects a vector of MaxVectorSize bytes.
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    // Move the scalar input to the FP register, accumulate the lanes of vsrc
    // into it, then move the result back to the general register.
    __ fmovwh($tmp$$FloatRegister, $hfsrc$$Register);
    __ sve_fadda($tmp$$FloatRegister, __ H, ptrue, $vsrc$$FloatRegister);
    __ fmovhw($dst$$Register, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
Expand Down Expand Up @@ -4190,6 +4406,20 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
ins_pipe(pipe_slow);
%}

// VectorCastHF2D

// Convert FP16 lanes to double, SVE encoding.
// Only matches when the result vector is longer than 16 bytes. The source
// lanes are first extended from H to D width into dst, then converted in
// place with ptrue in the predicate operand position.
instruct vcvtHFtoD_sve(vReg dst, vReg src) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (VectorCastHF2D src));
  format %{ "vcvtHFtoD_sve $dst, $src" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    const FloatRegister result_reg = $dst$$FloatRegister;
    __ sve_vector_extend(result_reg, __ D, $src$$FloatRegister, __ H);
    __ sve_fcvt(result_reg, __ D, ptrue, result_reg, __ H);
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastF2HF

instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
Expand Down Expand Up @@ -4217,6 +4447,20 @@ instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
ins_pipe(pipe_slow);
%}

// VectorCastD2HF

// Convert double lanes to FP16, SVE encoding.
// Only matches when the input vector is longer than 16 bytes. The lanes are
// converted first (result left in D-sized containers in dst), then narrowed
// from D to H width using tmp as scratch.
instruct vcvtDtoHF_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 16);
  match(Set dst (VectorCastD2HF src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtDtoHF_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    const FloatRegister result_reg = $dst$$FloatRegister;
    __ sve_fcvt(result_reg, __ H, ptrue, $src$$FloatRegister, __ D);
    __ sve_vector_narrow(result_reg, __ H, result_reg, __ D, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Replicate ------------------------------------

// replicate from reg
Expand Down
Loading

0 comments on commit 1317dcd

Please sign in to comment.