diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index b348d274e60251..2f4eb469b32e72 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1122,15 +1122,12 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_ins_g(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, (((index) << 1) | 0b1) << (type), 0b0011, (rd), (rn)) #define arm_neon_ins_e(p, type, rd, rn, indexd, indexs) arm_neon_cpy_opcode ((p), 0b1, 0b1, (((indexd) << 1) | 0b1) << (type), (indexs) << (type), (rd), (rn)) -// Specific opcodes: -#define arm_neon_dup_e_8b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_16b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_4h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_8h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_2s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_4s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_2d(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 4), 0b0000, (rd), (rn)) +#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I32) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0101, (rd), (rn)) +#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0111, (rd), (rn)) +#define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn)) +#define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index)) +// Specific opcodes: #define arm_neon_dup_g_8b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001, 0b0001, (rd), (rn)) #define arm_neon_dup_g_16b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001, 0b0001, (rd), (rn)) #define arm_neon_dup_g_4h(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010, 0b0001, (rd), (rn)) @@ -1141,15 +1138,13 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) #define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) -#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) +#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) #define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) #define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) #define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) #define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0111, (rd), (rn)) - /* NEON :: 3-register same FP16 */ // TODO @@ -1572,6 +1567,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) /* NEON :: across lanes */ #define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn)) +#define arm_neon_addv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b0, (type), 0b11011, (rd), (rn)) #define arm_neon_umaxv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn)) #define arm_neon_uminv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn)) @@ -1595,12 +1591,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_sminv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11010, (rd), (rn)) #define arm_neon_sminv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11010, (rd), (rn)) -#define arm_neon_addv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11011, (rd), (rn)) -#define arm_neon_addv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11011, (rd), (rn)) -#define arm_neon_addv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11011, (rd), (rn)) -#define arm_neon_addv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11011, (rd), (rn)) -#define arm_neon_addv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11011, (rd), (rn)) - // some fp16 opcodes here: fmaxnmv, fmaxv, fminnmv, fminv #define arm_neon_uaddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b00011, (rd), (rn)) @@ -1809,6 +1799,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm)) #define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm)) #define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm)) +#define arm_neon_addp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b10111, (rd), (rn), (rm)) // Generalized macros for float ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} @@ -1822,6 +1813,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fcmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm)) +#define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm)) // Generalized macros for bitwise ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} @@ -1991,14 +1983,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_sqdmulh_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm)) #define arm_neon_sqdmulh_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm)) -#define arm_neon_addp_8b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_16b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_4h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_8h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b10111, (rd), (rn), (rm)) - #define arm_neon_fmaxnm_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnm_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnm_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn), (rm)) @@ -2234,10 +2218,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fmaxnmp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnmp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11000, (rd), (rn), (rm)) -#define arm_neon_faddp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm)) -#define arm_neon_faddp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm)) -#define arm_neon_faddp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn), (rm)) - #define arm_neon_fmul_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm)) #define arm_neon_fmul_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm)) #define arm_neon_fmul_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11011, (rd), (rn), (rm)) diff --git a/src/mono/mono/arch/arm64/codegen-test.c b/src/mono/mono/arch/arm64/codegen-test.c index b89adf007f2eea..b50a6432cf252a 100644 --- a/src/mono/mono/arch/arm64/codegen-test.c +++ b/src/mono/mono/arch/arm64/codegen-test.c @@ -477,6 +477,11 @@ main (int argc, char *argv []) arm_neon_ins_e (code, TYPE_I8, ARMREG_R0, ARMREG_R1, 1, 5); // insert v1.b[5] into v0.b[1] arm_neon_ins_e (code, TYPE_I32, ARMREG_R0, ARMREG_R1, 1, 2); // insert v1.s[2] into v0.s[1] + // pairwise and horizontal adds + arm_neon_addv (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1); + arm_neon_addp (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2); + for (i = 0; i < code - buf; ++i) printf (".byte %d\n", buf [i]); printf ("\n"); diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index b6f8f23a64c257..39716de51ccfb3 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -506,6 +506,13 @@ ones_complement: dest:x src1:x len:4 xextract: dest:i src1:x len:12 xbinop_forceint: dest:x src1:x src2:x len:4 xcast: dest:x src1:x len:4 clob:1 +extract_i1: dest:i src1:x len:4 +extract_i2: dest:i src1:x len:4 +extract_i4: dest:i src1:x len:4 +extract_i8: dest:i src1:x len:4 +extract_r4: dest:f src1:x len:4 +extract_r8: dest:f src1:x len:4 +arm64_xaddv: dest:x src1:x len:8 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 57b140565ff6b1..55c26e0e4f35eb 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -27,6 +27,7 @@ #include #include #include +#include "llvm-intrinsics-types.h" #include "interp/interp.h" @@ -35,6 +36,7 @@ #define PARENTHESIZE(...) (__VA_ARGS__) #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__)) #define OPFMT_WDSS _w, dreg, sreg1, sreg2 +#define OPFMT_WTDS _w, _t, dreg, sreg1 #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2 #define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1 #define _UNDEF(...) g_assert_not_reached () @@ -3466,6 +3468,12 @@ is_type_float_macro (MonoTypeEnum type) return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8); } +static gboolean +is_type_unsigned_macro (MonoTypeEnum type) +{ + return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8); +} + static int get_vector_size_macro (MonoInst *ins) { @@ -3736,6 +3744,56 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XCAST: break; + case OP_EXTRACT_I1: + case OP_EXTRACT_I2: + case OP_EXTRACT_I4: + case OP_EXTRACT_I8: { + const int t = get_type_size_macro (ins->inst_c1); + // smov is not defined for i64 + if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) { + arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + } else { + arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + } + break; + } + case OP_EXTRACT_R4: + case OP_EXTRACT_R8: + if (ins->dreg != ins->sreg1 || ins->inst_c0 != 0) { + const int t = get_type_size_macro (ins->inst_c1); + // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should + // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest + // of the F/XREG is ignored in FREG mode, this operation remains valid. + arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0); + } + break; + case OP_ARM64_XADDV: { + switch (ins->inst_c0) { + case INTRINS_AARCH64_ADV_SIMD_FADDV: + if (ins->inst_c1 == MONO_TYPE_R8) { + arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1); + } else if (ins->inst_c1 == MONO_TYPE_R4) { + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg); + } else { + g_assert_not_reached (); + } + break; + + case INTRINS_AARCH64_ADV_SIMD_UADDV: + case INTRINS_AARCH64_ADV_SIMD_SADDV: + if (get_type_size_macro (ins->inst_c1) == TYPE_I64) + arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1); + else + g_assert_not_reached (); // remaining int types are handled through the codegen table + break; + + default: + g_assert_not_reached (); + } + break; + } + /* BRANCH */ case OP_BR: mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb, MONO_R_ARM64_B); diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h index 4af1a21892d4c7..21836b05103b12 100644 --- a/src/mono/mono/mini/simd-arm64.h +++ b/src/mono/mono/mini/simd-arm64.h @@ -62,3 +62,6 @@ SIMD_OP (128, OP_XBINOP, OP_FMIN, WTDSS, _UNDEF, SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_AND, WDSS, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and) SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR, WDSS, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr) SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_XOR, WDSS, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_UADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_SADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, _SKIP, _SKIP) \ No newline at end of file diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 747e6b237b4b84..168fff859c935c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -620,15 +620,23 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t return ins; } - MonoInst *ins = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1); - + MonoInst *sum = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1); if (type_enum_is_float (element_type)) { - ins->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c1 = element_type; } else { - ins->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV; + sum->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV; + sum->inst_c1 = element_type; } - return ins; + if (COMPILE_LLVM (cfg)) { + return sum; + } else { + MonoInst *ins = emit_simd_ins (cfg, vector_class, type_to_extract_op (element_type), sum->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = element_type; + return ins; + } } #endif #ifdef TARGET_WASM @@ -1250,6 +1258,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_AsUInt64: case SN_Max: case SN_Min: + case SN_Sum: + case SN_ToScalar: break; default: return NULL;