Skip to content

Commit

Permalink
C2 compiler support for float16 scalar operations.
Browse files Browse the repository at this point in the history
  • Loading branch information
jatin-bhateja committed Dec 15, 2024
1 parent 6b022bb commit c215eac
Show file tree
Hide file tree
Showing 54 changed files with 2,633 additions and 44 deletions.
80 changes: 80 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3476,6 +3476,22 @@ void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
emit_int16(0x6F, (0xC0 | encode));
}

// Move the low 16 bits of a GPR into an XMM register (scalar FP16 insert).
// EVEX.MAP5.66 0x6E (VMOVW xmm, r32/m16); requires AVX512-FP16.
void Assembler::vmovw(XMMRegister dst, Register src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
// NOTE(review): trailing 'true' argument mirrors the GPR-operand overload of
// vex_prefix_and_encode — confirm its meaning against the overload's signature.
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_MAP5, &attributes, true);
emit_int16(0x6E, (0xC0 | encode));
}

// Move the low 16 bits of an XMM register into a GPR (scalar FP16 extract).
// EVEX.MAP5.66 0x7E (VMOVW r32/m16, xmm); requires AVX512-FP16.
void Assembler::vmovw(Register dst, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
// 0x7E is the store direction, so the XMM source is the reg-field operand.
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_MAP5, &attributes, true);
emit_int16(0x7E, (0xC0 | encode));
}

void Assembler::vmovdqu(XMMRegister dst, Address src) {
assert(UseAVX > 0, "");
InstructionMark im(this);
Expand Down Expand Up @@ -8443,6 +8459,70 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector
emit_operand(dst, src, 0);
}

// Scalar FP16 add: dst[15:0] = nds[15:0] + src[15:0].
// EVEX.MAP5.F3 0x58 (VADDSH); requires AVX512-FP16.
void Assembler::vaddsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
// 128-bit scalar form: no vector-length dependence, EVEX-only MAP5 encoding.
InstructionAttr attr(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attr.set_is_evex_instruction();
int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attr);
emit_int16(0x58, (0xC0 | enc));
}

// Scalar FP16 subtract: dst[15:0] = nds[15:0] - src[15:0].
// EVEX.MAP5.F3 0x5C (VSUBSH); requires AVX512-FP16.
void Assembler::vsubsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attributes);
emit_int16(0x5C, (0xC0 | encode));
}

// Scalar FP16 divide: dst[15:0] = nds[15:0] / src[15:0].
// EVEX.MAP5.F3 0x5E (VDIVSH); requires AVX512-FP16.
void Assembler::vdivsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attributes);
emit_int16(0x5E, (0xC0 | encode));
}

// Scalar FP16 multiply: dst[15:0] = nds[15:0] * src[15:0].
// EVEX.MAP5.F3 0x59 (VMULSH); requires AVX512-FP16.
void Assembler::vmulsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attributes);
emit_int16(0x59, (0xC0 | encode));
}

// Scalar FP16 maximum: dst[15:0] = max(nds[15:0], src[15:0]).
// EVEX.MAP5.F3 0x5F (VMAXSH); requires AVX512-FP16.
void Assembler::vmaxsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attributes);
emit_int16(0x5F, (0xC0 | encode));
}

// Scalar FP16 minimum: dst[15:0] = min(nds[15:0], src[15:0]).
// EVEX.MAP5.F3 0x5D (VMINSH); requires AVX512-FP16.
void Assembler::vminsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP5 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attributes);
emit_int16(0x5D, (0xC0 | encode));
}

// Scalar FP16 square root: dst[15:0] = sqrt(src[15:0]).
// EVEX.MAP5.F3 0x51 (VSQRTSH); requires AVX512-FP16.
void Assembler::vsqrtsh(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
// 128-bit scalar form: no vector-length dependence, EVEX-only MAP5 encoding.
InstructionAttr attr(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attr.set_is_evex_instruction();
int enc = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_MAP5, &attr);
emit_int16(0x51, (0xC0 | enc));
}

// Scalar FP16 fused multiply-add, 132 form: dst = dst * src2 + src1.
// EVEX.MAP6.66 0x99 (VFMADD132SH); requires AVX512-FP16.
void Assembler::vfmadd132sh(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
// Use the same assert message as the other FP16 emitters (was "").
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// MAP6 encodings are EVEX-only; force the EVEX prefix form.
attributes.set_is_evex_instruction();
// src1 is the EVEX.vvvv (addend) operand, src2 the ModRM r/m (multiplier).
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_MAP6, &attributes);
emit_int16((unsigned char)0x99, (0xC0 | encode));
}

void Assembler::vpaddsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0 && (vector_len == Assembler::AVX_512bit || (!needs_evex(dst, nds, src) || VM_Version::supports_avx512vl())), "");
assert(!needs_evex(dst, nds, src) || VM_Version::supports_avx512bw(), "");
Expand Down
15 changes: 15 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,8 @@ class Assembler : public AbstractAssembler {
VEX_OPCODE_0F_38 = 0x2,
VEX_OPCODE_0F_3A = 0x3,
VEX_OPCODE_0F_3C = 0x4,
VEX_OPCODE_MAP5 = 0x5,
VEX_OPCODE_MAP6 = 0x6,
VEX_OPCODE_MASK = 0x1F
};

Expand Down Expand Up @@ -1815,6 +1817,9 @@ class Assembler : public AbstractAssembler {
void movsbl(Register dst, Address src);
void movsbl(Register dst, Register src);

void vmovw(XMMRegister dst, Register src);
void vmovw(Register dst, XMMRegister src);

#ifdef _LP64
void movsbq(Register dst, Address src);
void movsbq(Register dst, Register src);
Expand Down Expand Up @@ -2691,6 +2696,16 @@ class Assembler : public AbstractAssembler {
void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// FP16 instructions
void vaddsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vsubsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vdivsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmaxsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vminsh(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vsqrtsh(XMMRegister dst, XMMRegister src);
void vfmadd132sh(XMMRegister dst, XMMRegister src1, XMMRegister src2);

// Saturating packed instructions.
void vpaddsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddsw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
Expand Down
12 changes: 12 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6676,6 +6676,18 @@ void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst
}
}

// Dispatch a scalar float16 binary ideal opcode to the matching EVEX
// AVX512-FP16 instruction: dst = src1 <op> src2.
void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
switch(opcode) {
case Op_AddHF: vaddsh(dst, src1, src2); break;
case Op_SubHF: vsubsh(dst, src1, src2); break;
case Op_MulHF: vmulsh(dst, src1, src2); break;
case Op_DivHF: vdivsh(dst, src1, src2); break;
case Op_MaxHF: vmaxsh(dst, src1, src2); break;
case Op_MinHF: vminsh(dst, src1, src2); break;
// Any other opcode is a matcher bug; report its node class name.
default: assert(false, "%s", NodeClassNames[opcode]); break;
}
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
switch(elem_bt) {
case T_BYTE:
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@
void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle,
XMMRegister src, int vlen_enc);

void efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);

void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset,
Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/cpu/x86/vm_version_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_BITALG;
_features &= ~CPU_AVX512_IFMA;
_features &= ~CPU_APX_F;
_features &= ~CPU_AVX512_FP16;
}

// Currently APX support is only enabled for targets supporting AVX512VL feature.
Expand Down Expand Up @@ -1078,6 +1079,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_BITALG;
_features &= ~CPU_AVX512_IFMA;
_features &= ~CPU_AVX_IFMA;
_features &= ~CPU_AVX512_FP16;
}
}

Expand Down Expand Up @@ -3110,6 +3112,9 @@ uint64_t VM_Version::CpuidInfo::feature_flags() const {
}
if (sef_cpuid7_edx.bits.serialize != 0)
result |= CPU_SERIALIZE;

if (_cpuid_info.sef_cpuid7_edx.bits.avx512_fp16 != 0)
result |= CPU_AVX512_FP16;
}

// ZX features.
Expand Down
12 changes: 8 additions & 4 deletions src/hotspot/cpu/x86/vm_version_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,9 @@ class VM_Version : public Abstract_VM_Version {
serialize : 1,
: 5,
cet_ibt : 1,
: 11;
: 2,
avx512_fp16 : 1,
: 8;
} bits;
};

Expand Down Expand Up @@ -416,8 +418,9 @@ class VM_Version : public Abstract_VM_Version {
decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */ \
decl(AVX512_IFMA, "avx512_ifma", 58) /* Integer Vector FMA instructions*/ \
decl(AVX_IFMA, "avx_ifma", 59) /* 256-bit VEX-coded variant of AVX512-IFMA*/ \
decl(APX_F, "apx_f", 60) /* Intel Advanced Performance Extensions*/\
decl(SHA512, "sha512", 61) /* SHA512 instructions*/
decl(APX_F, "apx_f", 60) /* Intel Advanced Performance Extensions*/ \
decl(SHA512, "sha512", 61) /* SHA512 instructions*/ \
decl(AVX512_FP16, "avx512_fp16", 62) /* AVX512 FP16 ISA support*/

#define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
Expand Down Expand Up @@ -753,6 +756,7 @@ class VM_Version : public Abstract_VM_Version {
static bool supports_avx512_bitalg() { return (_features & CPU_AVX512_BITALG) != 0; }
static bool supports_avx512_vbmi() { return (_features & CPU_AVX512_VBMI) != 0; }
static bool supports_avx512_vbmi2() { return (_features & CPU_AVX512_VBMI2) != 0; }
static bool supports_avx512_fp16() { return (_features & CPU_AVX512_FP16) != 0; }
static bool supports_hv() { return (_features & CPU_HV) != 0; }
static bool supports_serialize() { return (_features & CPU_SERIALIZE) != 0; }
static bool supports_f16c() { return (_features & CPU_F16C) != 0; }
Expand Down Expand Up @@ -840,7 +844,7 @@ class VM_Version : public Abstract_VM_Version {

// For AVX CPUs only. f16c support is disabled if UseAVX == 0.
static bool supports_float16() {
return supports_f16c() || supports_avx512vl();
return supports_f16c() || supports_avx512vl() || supports_avx512_fp16();
}

// Check intrinsic support
Expand Down
110 changes: 110 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,20 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_AddHF:
case Op_DivHF:
case Op_FmaHF:
case Op_MaxHF:
case Op_MinHF:
case Op_MulHF:
case Op_ReinterpretS2HF:
case Op_ReinterpretHF2S:
case Op_SubHF:
case Op_SqrtHF:
if (!VM_Version::supports_avx512_fp16()) {
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
case Op_MulReductionVI:
Expand Down Expand Up @@ -4521,6 +4535,35 @@ instruct vReplS_reg(vec dst, rRegI src) %{
ins_pipe( pipe_slow );
%}

#ifdef _LP64
// Replicate a float16 constant across a vector: materialize the 16-bit
// immediate in a GPR, then broadcast it into every half-float lane.
instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
match(Set dst (Replicate con));
effect(TEMP rtmp);
format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
// No predicate on this rule; the immH operand presumably restricts matching
// to FP16 constants — the assert re-checks the CPU and element type.
assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
__ movl($rtmp$$Register, $con$$constant);
__ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

// Replicate a scalar float16 (held in the low 16 bits of an XMM register)
// across a vector: extract to a GPR via vmovw, then broadcast word-wise.
instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
match(Set dst (Replicate src));
effect(TEMP rtmp);
format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vmovw($rtmp$$Register, $src$$XMMRegister);
__ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
#endif

instruct ReplS_mem(vec dst, memory mem) %{
predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
match(Set dst (Replicate (LoadS mem)));
Expand Down Expand Up @@ -10837,3 +10880,70 @@ instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
%}
ins_pipe(pipe_slow);
%}

// Reinterpret a 16-bit short held in a GPR as a float16 value in an XMM
// register (bit-for-bit move, no conversion).
instruct reinterpretS2HF(regF dst, rRegI src)
%{
match(Set dst (ReinterpretS2HF src));
format %{ "vmovw $dst, $src" %}
ins_encode %{
__ vmovw($dst$$XMMRegister, $src$$Register);
%}
ins_pipe(pipe_slow);
%}

// Fused pattern: float -> half-float conversion followed by a short->HF
// reinterpret collapses into a single vcvtps2ph, keeping the result in an
// XMM register instead of round-tripping through a GPR.
instruct convF2HFAndS2HF(regF dst, regF src)
%{
match(Set dst (ReinterpretS2HF (ConvF2HF src)));
format %{ "convF2HFAndS2HF $dst, $src" %}
ins_encode %{
// imm 0x04 is the rounding-control byte — presumably MXCSR-directed
// rounding per the vcvtps2ph encoding; confirm against the SDM.
__ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
%}
ins_pipe(pipe_slow);
%}

// Reinterpret a float16 value in an XMM register as a 16-bit short in a GPR
// (bit-for-bit move, no conversion).
instruct reinterpretHF2S(rRegI dst, regF src)
%{
match(Set dst (ReinterpretHF2S src));
format %{ "vmovw $dst, $src" %}
ins_encode %{
__ vmovw($dst$$Register, $src$$XMMRegister);
%}
ins_pipe(pipe_slow);
%}

// Scalar float16 square root via VSQRTSH.
instruct scalar_sqrt_HF_reg(regF dst, regF src)
%{
match(Set dst (SqrtHF src));
format %{ "scalar_sqrt_fp16 $dst, $src" %}
ins_encode %{
__ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
%}
ins_pipe(pipe_slow);
%}

// Scalar float16 two-operand arithmetic (add/sub/mul/div/min/max).
// All six ideal opcodes share one rule; efp16sh() dispatches on the
// node's ideal opcode to the matching EVEX FP16 instruction.
instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
%{
match(Set dst (AddHF src1 src2));
match(Set dst (DivHF src1 src2));
match(Set dst (MaxHF src1 src2));
match(Set dst (MinHF src1 src2));
match(Set dst (MulHF src1 src2));
match(Set dst (SubHF src1 src2));
format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
ins_encode %{
int opcode = this->ideal_Opcode();
__ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
%}
ins_pipe(pipe_slow);
%}

// Scalar float16 fused multiply-add: dst = dst * src1 + src2.
// Uses the 132 form so the accumulator stays in $dst: vfmadd132sh computes
// dst = dst * (r/m operand) + (vvvv operand), hence the (dst, src2, src1)
// operand order below.
instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
%{
match(Set dst (FmaHF src2 (Binary dst src1)));
effect(DEF dst);
format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
ins_encode %{
__ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
Loading

0 comments on commit c215eac

Please sign in to comment.