8305563: [vectorapi]: Initial aarch64 backend implementation for FP16 operations

Reviewed-by: aph, xgong
jatin-bhateja · Jun 30, 2023 · 1317dcd · 1317dcd
1 parent d0d24f3
commit 1317dcd
Show file tree

Hide file tree

Showing 12 changed files with 1,183 additions and 783 deletions.
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -226,6 +226,20 @@ source %{
           return false;
         }
         break;
+      case Op_AddVHF:
+      case Op_SubVHF:
+      case Op_MulVHF:
+      case Op_DivVHF:
+      case Op_AbsVHF:
+      case Op_NegVHF:
+      case Op_FmaVHF:
+      case Op_AddReductionVHF:
+        // Half-precision vector ops. FEAT_FP16 is enabled if both "fphp" and
+        // "asimdhp" features are supported. Only the Neon instructions need
+        // this check. SVE supports 16-bit floats by default.
+        // Reject the opcode only when SVE is off and either feature is missing.
+        if (UseSVE == 0 &&
+            !(VM_Version::supports_fphp() && VM_Version::supports_asimdhp())) {
+          return false;
+        }
+        break;
       default:
         break;
     }
@@ -282,6 +296,7 @@ source %{
       case Op_VectorMaskCmp:
       case Op_LoadVectorGather:
       case Op_StoreVectorScatter:
+      case Op_AddReductionVHF:
       case Op_AddReductionVF:
       case Op_AddReductionVD:
       case Op_AndReductionV:
@@ -572,6 +587,22 @@ instruct vaddL(vReg dst, vReg src1, vReg src2) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector add - half-precision float
+//
+// Matches AddVHF. Emits the NEON fadd (arrangement from get_arrangement(this))
+// when use_neon_for_vector() accepts the vector length, otherwise the
+// unpredicated sve_fadd with element size H.
+instruct vaddHF(vReg dst, vReg src1, vReg src2) %{
+  match(Set dst (AddVHF src1 src2));
+  format %{ "vaddHF $dst, $src1, $src2" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
+    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
+      assert(UseSVE > 0, "must be sve");
+      __ sve_fadd($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
+    } else {
+      __ fadd($dst$$FloatRegister, get_arrangement(this),
+              $src1$$FloatRegister, $src2$$FloatRegister);
+    }
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vaddF(vReg dst, vReg src1, vReg src2) %{
   match(Set dst (AddVF src1 src2));
   format %{ "vaddF $dst, $src1, $src2" %}
@@ -646,6 +677,16 @@ instruct vaddL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector add - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The first input and the result share the
+// register dst_src1; pg is passed to sve_fadd as the governing predicate.
+instruct vaddHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddVHF (Binary dst_src1 src2) pg));
+  format %{ "vaddHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
+  ins_encode %{
+    FloatRegister acc = $dst_src1$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fadd(acc, __ H, mask, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vaddF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src1 (AddVF (Binary dst_src1 src2) pg));
@@ -796,6 +837,22 @@ instruct vsubL(vReg dst, vReg src1, vReg src2) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector sub - half-precision float
+//
+// Matches SubVHF. Emits the NEON fsub (arrangement from get_arrangement(this))
+// when use_neon_for_vector() accepts the vector length, otherwise the
+// unpredicated sve_fsub with element size H.
+instruct vsubHF(vReg dst, vReg src1, vReg src2) %{
+  match(Set dst (SubVHF src1 src2));
+  format %{ "vsubHF $dst, $src1, $src2" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
+    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
+      assert(UseSVE > 0, "must be sve");
+      __ sve_fsub($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
+    } else {
+      __ fsub($dst$$FloatRegister, get_arrangement(this),
+              $src1$$FloatRegister, $src2$$FloatRegister);
+    }
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vsubF(vReg dst, vReg src1, vReg src2) %{
   match(Set dst (SubVF src1 src2));
   format %{ "vsubF $dst, $src1, $src2" %}
@@ -870,6 +927,16 @@ instruct vsubL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector sub - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The minuend and the result share the register
+// dst_src1; pg is passed to sve_fsub as the governing predicate.
+instruct vsubHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (SubVHF (Binary dst_src1 src2) pg));
+  format %{ "vsubHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
+  ins_encode %{
+    FloatRegister acc = $dst_src1$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fsub(acc, __ H, mask, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vsubF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src1 (SubVF (Binary dst_src1 src2) pg));
@@ -993,6 +1060,22 @@ instruct vmulL_sve(vReg dst_src1, vReg src2) %{
 
 // vector mul - floating-point
 
+// vector mul - half-precision float
+//
+// Matches MulVHF. Emits the NEON fmul (arrangement from get_arrangement(this))
+// when use_neon_for_vector() accepts the vector length, otherwise the
+// unpredicated sve_fmul with element size H.
+instruct vmulHF(vReg dst, vReg src1, vReg src2) %{
+  match(Set dst (MulVHF src1 src2));
+  format %{ "vmulHF $dst, $src1, $src2" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
+    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
+      assert(UseSVE > 0, "must be sve");
+      __ sve_fmul($dst$$FloatRegister, __ H, $src1$$FloatRegister, $src2$$FloatRegister);
+    } else {
+      __ fmul($dst$$FloatRegister, get_arrangement(this),
+              $src1$$FloatRegister, $src2$$FloatRegister);
+    }
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vmulF(vReg dst, vReg src1, vReg src2) %{
   match(Set dst (MulVF src1 src2));
   format %{ "vmulF $dst, $src1, $src2" %}
@@ -1067,6 +1150,16 @@ instruct vmulL_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector mul - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The first factor and the result share the
+// register dst_src1; pg is passed to sve_fmul as the governing predicate.
+instruct vmulHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (MulVHF (Binary dst_src1 src2) pg));
+  format %{ "vmulHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
+  ins_encode %{
+    FloatRegister acc = $dst_src1$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fmul(acc, __ H, mask, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vmulF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src1 (MulVF (Binary dst_src1 src2) pg));
@@ -1091,6 +1184,28 @@ instruct vmulD_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
 
 // vector float div
 
+// vector div - half-precision float, NEON
+//
+// Selected for DivVHF when use_neon_for_vector() accepts the node's vector
+// length; emits a three-operand fdbritish-free fdiv with the arrangement from
+// get_arrangement(this).
+instruct vdivHF_neon(vReg dst, vReg src1, vReg src2) %{
+  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
+  match(Set dst (DivVHF src1 src2));
+  format %{ "vdivHF_neon $dst, $src1, $src2" %}
+  ins_encode %{
+    FloatRegister quotient = $dst$$FloatRegister;
+    FloatRegister dividend = $src1$$FloatRegister;
+    FloatRegister divisor  = $src2$$FloatRegister;
+    __ fdiv(quotient, get_arrangement(this), dividend, divisor);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// vector div - half-precision float, SVE
+//
+// Selected for DivVHF when use_neon_for_vector() rejects the node's vector
+// length. The dividend and the result share the register dst_src1, and
+// sve_fdiv is issued with ptrue as its governing predicate.
+instruct vdivHF_sve(vReg dst_src1, vReg src2) %{
+  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
+  match(Set dst_src1 (DivVHF dst_src1 src2));
+  format %{ "vdivHF_sve $dst_src1, $dst_src1, $src2" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    FloatRegister acc = $dst_src1$$FloatRegister;
+    FloatRegister divisor = $src2$$FloatRegister;
+    __ sve_fdiv(acc, __ H, ptrue, divisor);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vdivF_neon(vReg dst, vReg src1, vReg src2) %{
   predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
   match(Set dst (DivVF src1 src2));
@@ -1137,6 +1252,16 @@ instruct vdivD_sve(vReg dst_src1, vReg src2) %{
 
 // vector float div - predicated
 
+// vector div - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The dividend and the result share the register
+// dst_src1; pg is passed to sve_fdiv as the governing predicate.
+instruct vdivHF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (DivVHF (Binary dst_src1 src2) pg));
+  format %{ "vdivHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
+  ins_encode %{
+    FloatRegister acc = $dst_src1$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fdiv(acc, __ H, mask, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vdivF_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src1 (DivVF (Binary dst_src1 src2) pg));
@@ -1605,6 +1730,21 @@ instruct vabsL(vReg dst, vReg src) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector abs - half-precision float
+//
+// Matches AbsVHF. Emits the NEON fabs (arrangement from get_arrangement(this))
+// when use_neon_for_vector() accepts the vector length, otherwise sve_fabs
+// with element size H and ptrue as the governing predicate.
+instruct vabsHF(vReg dst, vReg src) %{
+  match(Set dst (AbsVHF src));
+  format %{ "vabsHF $dst, $src" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
+    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
+      assert(UseSVE > 0, "must be sve");
+      __ sve_fabs($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister);
+    } else {
+      __ fabs($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
+    }
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vabsF(vReg dst, vReg src) %{
   match(Set dst (AbsVF src));
   format %{ "vabsF $dst, $src" %}
@@ -1677,6 +1817,16 @@ instruct vabsL_masked(vReg dst_src, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector abs - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The operand is updated in place: dst_src is
+// both the source and the destination of sve_fabs, governed by pg.
+instruct vabsHF_masked(vReg dst_src, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src (AbsVHF dst_src pg));
+  format %{ "vabsHF_masked $dst_src, $pg, $dst_src" %}
+  ins_encode %{
+    FloatRegister inout = $dst_src$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fabs(inout, __ H, mask, inout);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vabsF_masked(vReg dst_src, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src (AbsVF dst_src pg));
@@ -1778,6 +1928,21 @@ instruct vnegL(vReg dst, vReg src) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector neg - half-precision float
+//
+// Matches NegVHF. Emits the NEON fneg (arrangement from get_arrangement(this))
+// when use_neon_for_vector() accepts the vector length, otherwise sve_fneg
+// with element size H and ptrue as the governing predicate.
+instruct vnegHF(vReg dst, vReg src) %{
+  match(Set dst (NegVHF src));
+  format %{ "vnegHF $dst, $src" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this);
+    if (!VM_Version::use_neon_for_vector(vlen_bytes)) {
+      assert(UseSVE > 0, "must be sve");
+      __ sve_fneg($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister);
+    } else {
+      __ fneg($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
+    }
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vnegF(vReg dst, vReg src) %{
   match(Set dst (NegVF src));
   format %{ "vnegF $dst, $src" %}
@@ -1832,6 +1997,16 @@ instruct vnegL_masked(vReg dst_src, pRegGov pg) %{
   ins_pipe(pipe_slow);
 %}
 
+// vector neg - half-precision float, predicated
+//
+// SVE-only rule (UseSVE > 0). The operand is updated in place: dst_src is
+// both the source and the destination of sve_fneg, governed by pg.
+instruct vnegHF_masked(vReg dst_src, pRegGov pg) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src (NegVHF dst_src pg));
+  format %{ "vnegHF_masked $dst_src, $pg, $dst_src" %}
+  ins_encode %{
+    FloatRegister inout = $dst_src$$FloatRegister;
+    PRegister mask = $pg$$PRegister;
+    __ sve_fneg(inout, __ H, mask, inout);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct vnegF_masked(vReg dst_src, pRegGov pg) %{
   predicate(UseSVE > 0);
   match(Set dst_src (NegVF dst_src pg));
@@ -2139,8 +2314,9 @@ instruct vmla_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
 
 instruct vfmla(vReg dst_src1, vReg src2, vReg src3) %{
   predicate(UseFMA);
-  match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3)));
-  match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3)));
+  match(Set dst_src1 (FmaVHF dst_src1 (Binary src2 src3)));
+  match(Set dst_src1 (FmaVF  dst_src1 (Binary src2 src3)));
+  match(Set dst_src1 (FmaVD  dst_src1 (Binary src2 src3)));
   format %{ "vfmla $dst_src1, $src2, $src3" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this);
@@ -2162,8 +2338,9 @@ instruct vfmla(vReg dst_src1, vReg src2, vReg src3) %{
 
 instruct vfmad_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
   predicate(UseFMA && UseSVE > 0);
-  match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary src3 pg)));
-  match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary src3 pg)));
+  match(Set dst_src1 (FmaVHF (Binary dst_src1 src2) (Binary src3 pg)));
+  match(Set dst_src1 (FmaVF  (Binary dst_src1 src2) (Binary src3 pg)));
+  match(Set dst_src1 (FmaVD  (Binary dst_src1 src2) (Binary src3 pg)));
   format %{ "vfmad_masked $dst_src1, $pg, $src2, $src3" %}
   ins_encode %{
     BasicType bt = Matcher::vector_element_basic_type(this);
@@ -2890,6 +3067,45 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
   ins_pipe(pipe_slow);
 %}
 
+// reduction addHF
+//
+// NEON rule (UseSVE == 0) for AddReductionVHF on 8- or 16-byte vectors.
+// The scalar input arrives in a general-purpose register, is moved into tmp2
+// with fmovwh, combined with the pairwise-reduced lanes of vsrc, and moved
+// back to the general-purpose result with fmovhw. Lanes are folded with
+// faddp, i.e. pairwise rather than strictly left to right.
+instruct reduce_addHF_neon(iRegINoSp dst, iRegIorL2I hfsrc, vReg vsrc, vReg tmp1, vRegF tmp2) %{
+  predicate(UseSVE == 0);
+  match(Set dst (AddReductionVHF hfsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_addHF_neon $dst, $hfsrc, $vsrc\t# KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    const uint vlen_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(vlen_bytes == 8 || vlen_bytes == 16, "invalid vector length");
+
+    FloatRegister lanes = $tmp1$$FloatRegister;
+    FloatRegister acc   = $tmp2$$FloatRegister;
+
+    __ fmovwh(acc, $hfsrc$$Register);
+
+    // Anything other than the 8-byte case first gets an extra T8H fold;
+    // the T4H fold then reads the partially reduced lanes instead of vsrc.
+    FloatRegister pair_in = $vsrc$$FloatRegister;
+    if (vlen_bytes != 8) {
+      __ faddp(lanes, __ T8H, pair_in, pair_in);
+      pair_in = lanes;
+    }
+    __ faddp(lanes, __ T4H, pair_in, pair_in);
+
+    __ faddp(lanes, lanes, __ H);
+    __ faddh(acc, acc, lanes);
+    __ fmovhw($dst$$Register, acc);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// reduction addHF, SVE
+//
+// SVE rule (UseSVE > 0) for AddReductionVHF. The vector input is asserted to
+// span MaxVectorSize bytes. The scalar input is moved from its general-purpose
+// register into tmp with fmovwh, sve_fadda accumulates the H lanes of vsrc
+// into it under ptrue, and fmovhw moves the sum back to the general-purpose
+// result register.
+instruct reduce_addHF_sve(iRegINoSp dst, iRegIorL2I hfsrc, vReg vsrc, vRegF tmp) %{
+  predicate(UseSVE > 0);
+  match(Set dst (AddReductionVHF hfsrc vsrc));
+  effect(TEMP tmp);
+  format %{ "reduce_addHF_sve $dst, $hfsrc, $vsrc\t# KILL $tmp" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ fmovwh($tmp$$FloatRegister, $hfsrc$$Register);
+    __ sve_fadda($tmp$$FloatRegister, __ H, ptrue, $vsrc$$FloatRegister);
+    __ fmovhw($dst$$Register, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // reduction addF
 // Floating-point addition is not associative, so the rules for AddReductionVF
 // on NEON can't be used to auto-vectorize floating-point reduce-add.
@@ -4190,6 +4406,20 @@ instruct vcvtHFtoF(vReg dst, vReg src) %{
   ins_pipe(pipe_slow);
 %}
 
+// VectorCastHF2D
+//
+// SVE rule for results wider than 16 bytes. The H source lanes are first
+// widened into D-sized lanes of dst with sve_vector_extend, then converted
+// in place with sve_fcvt (H to D) under ptrue.
+
+instruct vcvtHFtoD_sve(vReg dst, vReg src) %{
+  predicate(Matcher::vector_length_in_bytes(n) > 16);
+  match(Set dst (VectorCastHF2D src));
+  format %{ "vcvtHFtoD_sve $dst, $src" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    FloatRegister widened = $dst$$FloatRegister;
+    __ sve_vector_extend(widened, __ D, $src$$FloatRegister, __ H);
+    __ sve_fcvt(widened, __ D, ptrue, widened, __ H);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // VectorCastF2HF
 
 instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
@@ -4217,6 +4447,20 @@ instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
   ins_pipe(pipe_slow);
 %}
 
+// VectorCastD2HF
+//
+// SVE rule for inputs wider than 16 bytes. sve_fcvt converts D to H under
+// ptrue into dst, then sve_vector_narrow packs the D-sized lanes down to H
+// lanes in place, using tmp as scratch.
+instruct vcvtDtoHF_sve(vReg dst, vReg src, vReg tmp) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 16);
+  match(Set dst (VectorCastD2HF src));
+  effect(TEMP_DEF dst, TEMP tmp);
+  format %{ "vcvtDtoHF_sve $dst, $src\t# KILL $tmp" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    FloatRegister packed  = $dst$$FloatRegister;
+    FloatRegister scratch = $tmp$$FloatRegister;
+    __ sve_fcvt(packed, __ H, ptrue, $src$$FloatRegister, __ D);
+    __ sve_vector_narrow(packed, __ H, packed, __ D, scratch);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // ------------------------------ Replicate ------------------------------------
 
 // replicate from reg