Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all codegen engines #84289

Merged
merged 6 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
/* NEON :: extract */
#define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))

#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))

/* NEON :: copy */
#define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))
Expand Down
2 changes: 2 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4
create_scalar: dest:x src1:i len:12
create_scalar_unsafe: dest:x src1:i len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
147 changes: 93 additions & 54 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
Expand All @@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
arm_neon_ldrq_lit (code, dreg, 0);
}
break;
}
Expand All @@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_EXPAND_I4:
case OP_EXPAND_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
break;
}
case OP_EXPAND_R4:
case OP_EXPAND_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
break;
}
case OP_EXTRACT_I1:
Expand All @@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
const int t = get_type_size_macro (ins->inst_c1);
// smov is not defined for i64
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
} else {
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
}
break;
}
Expand All @@ -3773,17 +3815,27 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
// Technically, this broadcasts element #inst_c0 to all dest XREG elements, whereas it should
// set the FREG to said element. Since the FREG and XREG pools are the same on arm64 and the rest
// of the F/XREG is ignored in FREG mode, this operation remains valid.
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
}
break;
case OP_INSERT_I1:
case OP_INSERT_I2:
case OP_INSERT_I4:
case OP_INSERT_I8:
case OP_INSERT_R4:
case OP_INSERT_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
break;
}
case OP_ARM64_XADDV: {
switch (ins->inst_c0) {
case INTRINS_AARCH64_ADV_SIMD_FADDV:
if (ins->inst_c1 == MONO_TYPE_R8) {
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
} else if (ins->inst_c1 == MONO_TYPE_R4) {
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
} else {
g_assert_not_reached ();
}
Expand All @@ -3792,7 +3844,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case INTRINS_AARCH64_ADV_SIMD_UADDV:
case INTRINS_AARCH64_ADV_SIMD_SADDV:
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
else
g_assert_not_reached (); // remaining int types are handled through the codegen table
break;
Expand All @@ -3802,6 +3854,36 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_CREATE_SCALAR: {
int t = get_type_size_macro (ins->inst_c1);
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
if (is_type_float_macro (ins->inst_c1)) {
// ins expects an integer register
arm_fmov_double_to_rx(code, NEON_TMP_REG, sreg1);
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, NEON_TMP_REG, 0);
} else {
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
}
break;
}
case OP_CREATE_SCALAR_UNSAFE: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
// Enable this when adding support for Narrow and enable support for Create at the same time
// case OP_XCONCAT:
// arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
// break;

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3875,49 +3957,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
arm_cbnzx (code, sreg1, 0);
break;

/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;

/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down
96 changes: 31 additions & 65 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1097,11 +1097,6 @@ static guint16 sri_vector_methods [] = {
SN_AsUInt16,
SN_AsUInt32,
SN_AsUInt64,
SN_AsVector128,
vargaz marked this conversation as resolved.
Show resolved Hide resolved
SN_AsVector2,
SN_AsVector256,
SN_AsVector3,
SN_AsVector4,
SN_BitwiseAnd,
SN_BitwiseOr,
SN_Ceiling,
Expand Down Expand Up @@ -1150,8 +1145,6 @@ static guint16 sri_vector_methods [] = {
SN_ToScalar,
SN_ToVector128,
SN_ToVector128Unsafe,
SN_ToVector256,
SN_ToVector256Unsafe,
SN_WidenLower,
SN_WidenUpper,
SN_WithElement,
Expand Down Expand Up @@ -1216,76 +1209,47 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
if (!COMPILE_LLVM (cfg))
return NULL;
#endif
// FIXME: This limitation could be removed once everything here is supported by the mini JIT on arm64
#ifdef TARGET_ARM64
if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
return NULL;
#endif

int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);
if (id == -1) {
//check_no_intrinsic_cattr (cmethod);
return NULL;
}

if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256") || !strcmp (m_class_get_name (cfg->method->klass), "Vector512"))
if (!strcmp (m_class_get_name (cmethod->klass), "Vector256") || !strcmp (m_class_get_name (cmethod->klass), "Vector512"))
return NULL;

// FIXME: This limitation could be removed once everything here is supported by the mini JIT on arm64
#ifdef TARGET_ARM64
if (!COMPILE_LLVM (cfg)) {
if (!(!strcmp (m_class_get_name (cmethod->klass), "Vector128") || !strcmp (m_class_get_name (cmethod->klass), "Vector")))
return NULL;
switch (id) {
case SN_Add:
case SN_Equals:
case SN_GreaterThan:
case SN_GreaterThanOrEqual:
case SN_LessThan:
case SN_LessThanOrEqual:
case SN_Negate:
case SN_OnesComplement:
case SN_EqualsAny:
case SN_GreaterThanAny:
case SN_GreaterThanOrEqualAny:
case SN_LessThanAny:
case SN_LessThanOrEqualAny:
case SN_EqualsAll:
case SN_GreaterThanAll:
case SN_GreaterThanOrEqualAll:
case SN_LessThanAll:
case SN_LessThanOrEqualAll:
case SN_Subtract:
case SN_BitwiseAnd:
case SN_BitwiseOr:
case SN_Xor:
case SN_As:
case SN_AsByte:
case SN_AsDouble:
case SN_AsInt16:
case SN_AsInt32:
case SN_AsInt64:
case SN_AsSByte:
case SN_AsSingle:
case SN_AsUInt16:
case SN_AsUInt32:
case SN_AsUInt64:
case SN_Max:
case SN_Min:
case SN_Sum:
case SN_ToScalar:
case SN_Floor:
case SN_Ceiling:
case SN_Divide:
case SN_Multiply:
case SN_Sqrt:
case SN_Abs:
break;
default:
case SN_AndNot:
case SN_ConditionalSelect:
case SN_ConvertToDouble:
case SN_ConvertToInt32:
case SN_ConvertToInt64:
case SN_ConvertToSingle:
case SN_ConvertToUInt32:
case SN_ConvertToUInt64:
case SN_Create:
case SN_Dot:
case SN_ExtractMostSignificantBits:
case SN_GetElement:
case SN_GetLower:
case SN_GetUpper:
case SN_Narrow:
case SN_Shuffle:
case SN_ToVector128:
case SN_ToVector128Unsafe:
case SN_WidenLower:
case SN_WidenUpper:
case SN_WithElement:
return NULL;
default:
break;
}
MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]);
int class_size = mono_class_value_size (arg0_class, NULL);
if (class_size != 16)
return NULL;
}
#endif

Expand Down Expand Up @@ -1462,9 +1426,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
MonoType *etype = get_vector_t_elem_type (fsig->ret);
if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
return NULL;
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
else if (is_create_from_half_vectors_overload (fsig))
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) {
MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
ins->inst_c1 = arg0_type;
return ins;
} else if (is_create_from_half_vectors_overload (fsig))
return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg);
else if (is_elementwise_create_overload (fsig, etype))
return emit_vector_create_elementwise (cfg, fsig, fsig->ret, etype, args);
Expand Down