From a5f8a381ccb499b955a0ede518d5ac05dbef3874 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 21 Feb 2024 16:49:32 +0100 Subject: [PATCH 01/15] [Mono] [Arm64] Added multiple vector intrinsics --- src/mono/mono/arch/arm64/arm64-codegen.h | 2 + src/mono/mono/mini/simd-arm64.h | 4 + src/mono/mono/mini/simd-intrinsics.c | 205 ++++++++++++++++++++++- 3 files changed, 203 insertions(+), 8 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index f749f5be8eff1..ddfa2f7736c16 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1270,6 +1270,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fabs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, 0b10 | (type), 0b01111, (rd), (rn)) #define arm_neon_fneg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b01111, (rd), (rn)) #define arm_neon_fsqrt(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11111, (rd), (rn)) +#define arm_neon_frsqrte(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11101, (rd), (rn)) #define arm_neon_fcvtn(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn)) #define arm_neon_fcvtn2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn)) #define arm_neon_fcvtl(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn)) @@ -1845,6 +1846,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm)) +#define 
arm_neon_frsqrts(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, 0b10 | (type), 0b11111, (rd), (rn), (rm)) // Generalized macros for bitwise ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h index b9a5b9dc9f641..0eef05003e429 100644 --- a/src/mono/mono/mini/simd-arm64.h +++ b/src/mono/mono/mini/simd-arm64.h @@ -37,6 +37,8 @@ SIMD_OP (64, OP_XBINOP, OP_FDIV, WTDSS, _UNDEF, SIMD_OP (64, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, _SKIP, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fsqrt, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, _UNDEF) +SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, _UNDEF) +SIMD_OP (64, OP_XBINOP, INTRINS_AARCH64_ADV_SIMD_FRSQRTS,WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrts, _UNDEF) /* 128-bit vectors */ /* Width Opcode Function Operand config I8 I16 I32 I64 F32 F64 */ @@ -91,3 +93,5 @@ SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR, WTDS, _UNDEF, _U SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fsqrt, arm_neon_fsqrt) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_ABS, WTDS, arm_neon_abs, arm_neon_abs, arm_neon_abs, arm_neon_abs, _UNDEF, _UNDEF) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, arm_neon_fabs) +SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, arm_neon_frsqrte) +SIMD_OP (128, OP_XBINOP, INTRINS_AARCH64_ADV_SIMD_FRSQRTS,WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrts, arm_neon_frsqrts) diff --git 
a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index fe8cdaa1dd69a..546350b812453 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -688,6 +688,26 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t return ins; } } + +static MonoInst* +emit_sum_sqrt_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg) { + MonoInst *sum = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, arg->dreg, -1); + sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c1 = MONO_TYPE_R4; + + MonoInst* sum_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); + sum_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; + sum_sqrt->inst_c1 = MONO_TYPE_R4; + + if (COMPILE_LLVM (cfg)) { + return sum_sqrt; + } else { + MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sum_sqrt->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R4; + return ins; + } +} #endif #ifdef TARGET_WASM static MonoInst* emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_type, MonoInst *arg); @@ -1087,6 +1107,58 @@ emit_vector_insert_element ( return ins; } +#if defined(TARGET_ARM64) +static MonoInst* +emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg){ + MonoInst *vec_squared = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, arg->dreg); + vec_squared->inst_c0 = OP_FMUL; + vec_squared->inst_c1 = MONO_TYPE_R4; + + const char *class_name = m_class_get_name (klass); + if (!strcmp ("Plane", class_name)) { + static float r4_0 = 0; + MonoInst *zero; + int zero_dreg = alloc_freg (cfg); + MONO_INST_NEW (cfg, zero, OP_R4CONST); + zero->inst_p0 = (void*)&r4_0; + zero->dreg = zero_dreg; + MONO_ADD_INS (cfg->cbb, zero); + vec_squared = emit_vector_insert_element (cfg, klass, vec_squared, MONO_TYPE_R4, zero, 3, FALSE); + } + + MonoInst *sum = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, vec_squared->dreg, -1); + sum->inst_c0 = 
INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c1 = MONO_TYPE_R4; + + MonoInst *recip_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); + recip_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTE; + recip_sqrt->inst_c1 = MONO_TYPE_R4; + + + MonoInst *recip_sqrt_2, *corr; + + for (int i = 0; i < 2; i++) { + recip_sqrt_2 = emit_simd_ins (cfg, klass, OP_XBINOP, recip_sqrt->dreg, recip_sqrt->dreg); + recip_sqrt_2->inst_c0 = OP_FMUL; + recip_sqrt_2->inst_c1 = MONO_TYPE_R4; + + corr = emit_simd_ins (cfg, klass, OP_XBINOP, sum->dreg, recip_sqrt_2->dreg); + corr->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTS; + corr->inst_c1 = MONO_TYPE_R4; + + recip_sqrt = emit_simd_ins (cfg, klass, OP_XBINOP, recip_sqrt->dreg, corr->dreg); + recip_sqrt->inst_c0 = OP_FMUL; + recip_sqrt->inst_c1 = MONO_TYPE_R4; + } + + MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, recip_sqrt->dreg); + normalized_vec->inst_c0 = OP_FMUL; + normalized_vec->inst_c1 = MONO_TYPE_R4; + + return normalized_vec; +} +#endif + static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, @@ -2749,6 +2821,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f etype = m_class_get_byval_arg (mono_defaults.single_class); len = mono_class_value_size (klass, NULL) / 4; + const char *class_name = m_class_get_name (klass); #ifndef TARGET_ARM64 if (!COMPILE_LLVM (cfg)) return NULL; @@ -2926,6 +2999,8 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f value [1] = 1.0f; value [2] = 1.0f; value [3] = 1.0f; + if (len == 3) + value [3] = 0.0f; return emit_xconst_v128 (cfg, klass, (guint8*)value); } case SN_set_Item: { @@ -3076,9 +3151,42 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f return NULL; #endif } - case SN_CopyTo: - // FIXME: https://github.com/dotnet/runtime/issues/91394 - return NULL; + case SN_CopyTo: { +#if defined(TARGET_ARM64) + 
MonoInst *index_ins; + int val_vreg, end_index_reg; + val_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL); + + if (fsig->param_count == 2) { + index_ins = args [2]; + } else { + EMIT_NEW_ICONST (cfg, index_ins, 0); + } + + MonoInst *ldelema_ins; + if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) { + MonoInst *array_ins = args [1]; + /* CopyTo () does complicated argument checks */ + mini_emit_bounds_check_offset (cfg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length), index_ins->dreg, "ArgumentOutOfRangeException", FALSE); + end_index_reg = alloc_ireg (cfg); + int len_reg = alloc_ireg (cfg); + MONO_EMIT_NEW_LOAD_MEMBASE_OP_FLAGS (cfg, OP_LOADI4_MEMBASE, len_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length), MONO_INST_INVARIANT_LOAD); + EMIT_NEW_BIALU (cfg, ins, OP_ISUB, end_index_reg, len_reg, index_ins->dreg); + MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, end_index_reg, len); + MONO_EMIT_NEW_COND_EXC (cfg, LT, "ArgumentException"); + + /* Load the array slice into the simd reg */ + ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type_internal (etype), array_ins, index_ins, FALSE, FALSE); + EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, val_vreg); + ins->klass = cmethod->klass; + return ins; + } else { + //TODO: CopyTo(Span) + return NULL; + } +#endif + } + break; case SN_Clamp: { if (!(!fsig->hasthis && fsig->param_count == 3 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type) && mono_metadata_type_equal (fsig->params [2], type))) return NULL; @@ -3093,13 +3201,94 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f return min; } - case SN_Conjugate: - case SN_Distance: - case SN_DistanceSquared: + case SN_Distance: + case SN_DistanceSquared: { +#if defined(TARGET_ARM64) + MonoInst *diffs = 
emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, MONO_TYPE_R4, fsig, args); + MonoInst *diffs_squared = emit_simd_ins (cfg, klass, OP_XBINOP, diffs->dreg, diffs->dreg); + diffs_squared->inst_c0 = OP_FMUL; + diffs_squared->inst_c1 = MONO_TYPE_R4; + + switch (id) { + case SN_Distance: + return emit_sum_sqrt_vector_2_3_4 (cfg, klass, diffs_squared); + case SN_DistanceSquared: + return emit_sum_vector (cfg, fsig->params [0], MONO_TYPE_R4, diffs_squared); + default: + g_assert_not_reached (); + } +#endif + } + break; case SN_Length: - case SN_LengthSquared: - case SN_Lerp: + case SN_LengthSquared: { +#if defined (TARGET_ARM64) + int src1 = load_simd_vreg (cfg, cmethod, args [0], NULL); + + MonoInst *vec_squared = emit_simd_ins (cfg, klass, OP_XBINOP, src1, src1); + vec_squared->inst_c0 = OP_FMUL; + vec_squared->inst_c1 = MONO_TYPE_R4; + + switch (id) { + case SN_Length: + return emit_sum_sqrt_vector_2_3_4 (cfg, klass, vec_squared); + case SN_LengthSquared: + return emit_sum_vector (cfg, type, MONO_TYPE_R4, vec_squared); + default: + g_assert_not_reached (); + } +#endif + } + break; + case SN_Lerp: { +#if defined (TARGET_ARM64) + MonoInst* v1 = args [1]; + if (!strcmp ("Quaternion", class_name)) { + MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, MONO_TYPE_R4, fsig, args); + pairwise_multiply->sreg3 = -1; + MonoInst *dot = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, pairwise_multiply->dreg, -1); + dot->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + dot->inst_c1 = MONO_TYPE_R4; + + MonoInst* zeros = emit_xzero (cfg, klass); + + MonoInst* ge_0 = emit_simd_ins (cfg, klass, OP_XCOMPARE_FP, dot->dreg, zeros->dreg); + ge_0->inst_c0 = CMP_GE; + ge_0->inst_c1 = MONO_TYPE_R4; + + MonoInst* negated_v1 = emit_simd_ins (cfg, klass, OP_NEGATION, args [1]->dreg, -1); + negated_v1->inst_c1 = MONO_TYPE_R4; + + v1 = emit_simd_ins (cfg, klass, OP_BSL, ge_0->dreg, args [1]->dreg); + v1->sreg3 = negated_v1->dreg; + v1->inst_c1 = MONO_TYPE_R4; + } + + 
MonoInst *diffs = emit_simd_ins (cfg, klass, OP_XBINOP, v1->dreg, args [0]->dreg); + diffs->inst_c0 = OP_FSUB; + diffs->inst_c1 = MONO_TYPE_R4; + + MonoInst *scaled_diffs = handle_mul_div_by_scalar (cfg, klass, MONO_TYPE_R4, args [2]->dreg, diffs->dreg, OP_FMUL); + + MonoInst *result = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, scaled_diffs->dreg); + result->inst_c0 = OP_FADD; + result->inst_c1 = MONO_TYPE_R4; + + if (!strcmp ("Quaternion", class_name)) { + return emit_normalize_vector_2_3_4 (cfg, klass, result); + } + + return result; +#endif + } + break; case SN_Normalize: { +#if defined (TARGET_ARM64) + return emit_normalize_vector_2_3_4 (cfg, klass, args[0]); +#endif + } + break; + case SN_Conjugate: { // FIXME: https://github.com/dotnet/runtime/issues/91394 return NULL; } From 8a3cee4de0f1193dd88e30875cdbc03716caac0d Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 21 Feb 2024 19:03:18 +0100 Subject: [PATCH 02/15] Added LLVM support --- src/mono/mono/mini/mini-llvm.c | 8 +++- src/mono/mono/mini/simd-intrinsics.c | 57 +++++++++++++++++----------- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 5db5828eaaa91..bc5bb938a547b 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8486,7 +8486,13 @@ MONO_RESTORE_WARNING #endif break; } - + case INTRINS_AARCH64_ADV_SIMD_FRSQRTS: { + IntrinsicId iid = (IntrinsicId) ins->inst_c0; + LLVMValueRef call_args [] = { l, r }; + llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass); + result = call_overloaded_intrins (ctx, iid, ovr_tag, call_args, ""); + break; + } default: g_assert_not_reached (); } diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 546350b812453..744804c7e3c09 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -695,18 +695,19 @@ emit_sum_sqrt_vector_2_3_4 (MonoCompile *cfg, 
MonoClass *klass, MonoInst *arg) { sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; sum->inst_c1 = MONO_TYPE_R4; + if (COMPILE_LLVM (cfg)) { + sum = emit_simd_ins (cfg, klass, OP_EXPAND_R4, sum->dreg, -1); + sum->inst_c1 = MONO_TYPE_R4; + } + MonoInst* sum_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); sum_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; sum_sqrt->inst_c1 = MONO_TYPE_R4; - if (COMPILE_LLVM (cfg)) { - return sum_sqrt; - } else { - MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sum_sqrt->dreg, -1); - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_R4; - return ins; - } + MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sum_sqrt->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R4; + return ins; } #endif #ifdef TARGET_WASM @@ -1130,6 +1131,11 @@ emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg){ sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; sum->inst_c1 = MONO_TYPE_R4; + if (COMPILE_LLVM (cfg)) { + sum = emit_simd_ins (cfg, klass, OP_EXPAND_R4, sum->dreg, -1); + sum->inst_c1 = MONO_TYPE_R4; + } + MonoInst *recip_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); recip_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTE; recip_sqrt->inst_c1 = MONO_TYPE_R4; @@ -2821,7 +2827,6 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f etype = m_class_get_byval_arg (mono_defaults.single_class); len = mono_class_value_size (klass, NULL) / 4; - const char *class_name = m_class_get_name (klass); #ifndef TARGET_ARM64 if (!COMPILE_LLVM (cfg)) return NULL; @@ -3153,19 +3158,20 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f } case SN_CopyTo: { #if defined(TARGET_ARM64) - MonoInst *index_ins; - int val_vreg, end_index_reg; - val_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL); + if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) { + MonoInst *index_ins; + int 
val_vreg, end_index_reg; + val_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL); - if (fsig->param_count == 2) { - index_ins = args [2]; - } else { - EMIT_NEW_ICONST (cfg, index_ins, 0); - } + if (fsig->param_count == 2) { + index_ins = args [2]; + } else { + EMIT_NEW_ICONST (cfg, index_ins, 0); + } - MonoInst *ldelema_ins; - if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) { + MonoInst *ldelema_ins; MonoInst *array_ins = args [1]; + /* CopyTo () does complicated argument checks */ mini_emit_bounds_check_offset (cfg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length), index_ins->dreg, "ArgumentOutOfRangeException", FALSE); end_index_reg = alloc_ireg (cfg); @@ -3181,7 +3187,8 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f ins->klass = cmethod->klass; return ins; } else { - //TODO: CopyTo(Span) + // CopyTo(Span) + // Not intrinsified on coreclr return NULL; } #endif @@ -3243,13 +3250,18 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f case SN_Lerp: { #if defined (TARGET_ARM64) MonoInst* v1 = args [1]; - if (!strcmp ("Quaternion", class_name)) { + if (!strcmp ("Quaternion", m_class_get_name (klass))) { MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, MONO_TYPE_R4, fsig, args); pairwise_multiply->sreg3 = -1; MonoInst *dot = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, pairwise_multiply->dreg, -1); dot->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; dot->inst_c1 = MONO_TYPE_R4; + if (COMPILE_LLVM (cfg)) { + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c1 = MONO_TYPE_R4; + } + MonoInst* zeros = emit_xzero (cfg, klass); MonoInst* ge_0 = emit_simd_ins (cfg, klass, OP_XCOMPARE_FP, dot->dreg, zeros->dreg); @@ -3274,7 +3286,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f result->inst_c0 = OP_FADD; result->inst_c1 = MONO_TYPE_R4; - if (!strcmp 
("Quaternion", class_name)) { + if (!strcmp ("Quaternion", m_class_get_name (klass))) { return emit_normalize_vector_2_3_4 (cfg, klass, result); } @@ -3289,7 +3301,6 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f } break; case SN_Conjugate: { - // FIXME: https://github.com/dotnet/runtime/issues/91394 return NULL; } default: From 87d97d9daa426d11d24303a6ae7e8311cc37af4a Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 21 Feb 2024 19:45:12 +0100 Subject: [PATCH 03/15] fix build errors on x64 --- src/mono/mono/mini/mini-llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index bc5bb938a547b..08bbc62300907 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8486,6 +8486,7 @@ MONO_RESTORE_WARNING #endif break; } +#if defined(TARGET_ARM64) case INTRINS_AARCH64_ADV_SIMD_FRSQRTS: { IntrinsicId iid = (IntrinsicId) ins->inst_c0; LLVMValueRef call_args [] = { l, r }; @@ -8493,6 +8494,7 @@ MONO_RESTORE_WARNING result = call_overloaded_intrins (ctx, iid, ovr_tag, call_args, ""); break; } +#endif default: g_assert_not_reached (); } From 5b1fa3df3232e06e414e31ef0a856f7fc8864c30 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 21 Feb 2024 22:57:42 +0100 Subject: [PATCH 04/15] Added Quaternion.Conjugate --- src/mono/mono/mini/simd-intrinsics.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 744804c7e3c09..40b8b2510163d 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -634,6 +634,7 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16]) ins->type = STACK_VTYPE; ins->dreg = alloc_xreg (cfg); ins->inst_p0 = mono_mem_manager_alloc (cfg->mem_manager, size); + ins->klass = klass; MONO_ADD_INS (cfg->cbb, ins); memcpy (ins->inst_p0, &value[0], size); @@ 
-3301,8 +3302,20 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f } break; case SN_Conjugate: { - return NULL; +#if defined (TARGET_ARM64) + float value[4]; + value [0] = -1.0f; + value [1] = -1.0f; + value [2] = -1.0f; + value [3] = 1.0f; + MonoInst* r = emit_xconst_v128 (cfg, klass, (guint8*)value); + MonoInst* result = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, r->dreg); + result->inst_c0 = OP_FMUL; + result->inst_c1 = MONO_TYPE_R4; + return result; +#endif } + break; default: g_assert_not_reached (); } From 70c4d7ee2cc319e3e1eba14876fe17742b6671db Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 21 Feb 2024 23:58:14 +0100 Subject: [PATCH 05/15] Changed frsqrts codegen --- src/mono/mono/mini/mini-arm64.c | 3 +++ src/mono/mono/mini/mini-llvm.c | 9 --------- src/mono/mono/mini/simd-arm64.h | 2 -- src/mono/mono/mini/simd-intrinsics.c | 2 +- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 657aac2c79d8a..e522230d26065 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -4112,6 +4112,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case INTRINS_AARCH64_ADV_SIMD_USHL: arm_neon_ushl (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); break; + case INTRINS_AARCH64_ADV_SIMD_FRSQRTS: + arm_neon_frsqrts (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); + break; default: g_assert_not_reached (); break; diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 08bbc62300907..6ae9086ea3f7f 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8486,15 +8486,6 @@ MONO_RESTORE_WARNING #endif break; } -#if defined(TARGET_ARM64) - case INTRINS_AARCH64_ADV_SIMD_FRSQRTS: { - IntrinsicId iid = (IntrinsicId) ins->inst_c0; - LLVMValueRef call_args [] = { 
l, r }; - llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass); - result = call_overloaded_intrins (ctx, iid, ovr_tag, call_args, ""); - break; - } -#endif default: g_assert_not_reached (); } diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h index 0eef05003e429..52c0741c85ace 100644 --- a/src/mono/mono/mini/simd-arm64.h +++ b/src/mono/mono/mini/simd-arm64.h @@ -38,7 +38,6 @@ SIMD_OP (64, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fsqrt, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, _UNDEF) -SIMD_OP (64, OP_XBINOP, INTRINS_AARCH64_ADV_SIMD_FRSQRTS,WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrts, _UNDEF) /* 128-bit vectors */ /* Width Opcode Function Operand config I8 I16 I32 I64 F32 F64 */ @@ -94,4 +93,3 @@ SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_ABS, WTDS, arm_neon_abs, arm_neon_abs, arm_neon_abs, arm_neon_abs, _UNDEF, _UNDEF) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, arm_neon_fabs) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, arm_neon_frsqrte) -SIMD_OP (128, OP_XBINOP, INTRINS_AARCH64_ADV_SIMD_FRSQRTS,WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrts, arm_neon_frsqrts) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 40b8b2510163d..9c535e2cec158 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1149,7 +1149,7 @@ emit_normalize_vector_2_3_4 (MonoCompile *cfg, 
MonoClass *klass, MonoInst *arg){ recip_sqrt_2->inst_c0 = OP_FMUL; recip_sqrt_2->inst_c1 = MONO_TYPE_R4; - corr = emit_simd_ins (cfg, klass, OP_XBINOP, sum->dreg, recip_sqrt_2->dreg); + corr = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X_X, sum->dreg, recip_sqrt_2->dreg); corr->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTS; corr->inst_c1 = MONO_TYPE_R4; From f205632c386a1cda744b6ec7ad866401d69d7c07 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Fri, 23 Feb 2024 22:34:29 +0900 Subject: [PATCH 06/15] fixed whitespace changes --- src/mono/mono/mini/mini-llvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 6ae9086ea3f7f..835d77a214a81 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8486,6 +8486,7 @@ MONO_RESTORE_WARNING #endif break; } + default: g_assert_not_reached (); } From 874448cf8d8dc9a43e4b9c784e3db0b90b1232fc Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Mon, 11 Mar 2024 15:39:45 +0100 Subject: [PATCH 07/15] Removed reciprocal sqrt estimation from normalize --- src/mono/mono/arch/arm64/arm64-codegen.h | 2 -- src/mono/mono/mini/mini-arm64.c | 3 --- src/mono/mono/mini/mini-llvm.c | 2 +- src/mono/mono/mini/simd-arm64.h | 2 -- src/mono/mono/mini/simd-intrinsics.c | 29 +++++------------------- 5 files changed, 7 insertions(+), 31 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index ddfa2f7736c16..f749f5be8eff1 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1270,7 +1270,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fabs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, 0b10 | (type), 0b01111, (rd), (rn)) #define arm_neon_fneg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b01111, (rd), (rn)) #define arm_neon_fsqrt(p, width, type, rd, rn) arm_neon_2mvec_opcode 
((p), (width), 0b1, 0b10 | (type), 0b11111, (rd), (rn)) -#define arm_neon_frsqrte(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11101, (rd), (rn)) #define arm_neon_fcvtn(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn)) #define arm_neon_fcvtn2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn)) #define arm_neon_fcvtl(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn)) @@ -1846,7 +1845,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm)) -#define arm_neon_frsqrts(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, 0b10 | (type), 0b11111, (rd), (rn), (rm)) // Generalized macros for bitwise ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index e522230d26065..657aac2c79d8a 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -4112,9 +4112,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case INTRINS_AARCH64_ADV_SIMD_USHL: arm_neon_ushl (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); break; - case INTRINS_AARCH64_ADV_SIMD_FRSQRTS: - arm_neon_frsqrts (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); - break; default: g_assert_not_reached (); break; diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 835d77a214a81..5db5828eaaa91 100644 --- 
a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8486,7 +8486,7 @@ MONO_RESTORE_WARNING #endif break; } - + default: g_assert_not_reached (); } diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h index 52c0741c85ace..b9a5b9dc9f641 100644 --- a/src/mono/mono/mini/simd-arm64.h +++ b/src/mono/mono/mini/simd-arm64.h @@ -37,7 +37,6 @@ SIMD_OP (64, OP_XBINOP, OP_FDIV, WTDSS, _UNDEF, SIMD_OP (64, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, _SKIP, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fsqrt, _UNDEF) SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, _UNDEF) -SIMD_OP (64, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, _UNDEF) /* 128-bit vectors */ /* Width Opcode Function Operand config I8 I16 I32 I64 F32 F64 */ @@ -92,4 +91,3 @@ SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR, WTDS, _UNDEF, _U SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fsqrt, arm_neon_fsqrt) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_ABS, WTDS, arm_neon_abs, arm_neon_abs, arm_neon_abs, arm_neon_abs, _UNDEF, _UNDEF) SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fabs, arm_neon_fabs) -SIMD_OP (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRSQRTE,WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_frsqrte, arm_neon_frsqrte) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 9c535e2cec158..10e822e91c20b 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1137,31 +1137,14 @@ emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg){ sum->inst_c1 = MONO_TYPE_R4; } - MonoInst 
*recip_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); - recip_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTE; - recip_sqrt->inst_c1 = MONO_TYPE_R4; + MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); + sqrt_vec->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; + sqrt_vec->inst_c1 = MONO_TYPE_R4; - - MonoInst *recip_sqrt_2, *corr; - - for (int i = 0; i < 2; i++) { - recip_sqrt_2 = emit_simd_ins (cfg, klass, OP_XBINOP, recip_sqrt->dreg, recip_sqrt->dreg); - recip_sqrt_2->inst_c0 = OP_FMUL; - recip_sqrt_2->inst_c1 = MONO_TYPE_R4; - - corr = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X_X, sum->dreg, recip_sqrt_2->dreg); - corr->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FRSQRTS; - corr->inst_c1 = MONO_TYPE_R4; - - recip_sqrt = emit_simd_ins (cfg, klass, OP_XBINOP, recip_sqrt->dreg, corr->dreg); - recip_sqrt->inst_c0 = OP_FMUL; - recip_sqrt->inst_c1 = MONO_TYPE_R4; - } - - MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, recip_sqrt->dreg); - normalized_vec->inst_c0 = OP_FMUL; + MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, sqrt_vec->dreg); + normalized_vec->inst_c0 = OP_FDIV; normalized_vec->inst_c1 = MONO_TYPE_R4; - + return normalized_vec; } #endif From 4e3f2b85f98de1cb8b0db97c6644ab52fdbe4224 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Tue, 12 Mar 2024 05:57:32 +0100 Subject: [PATCH 08/15] Extracted dot method into separate function --- src/mono/mono/mini/simd-intrinsics.c | 155 ++++++++++++--------------- 1 file changed, 69 insertions(+), 86 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 10e822e91c20b..25cf7cb0a9c8f 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1470,6 +1470,73 @@ emit_msb_shift_vector_constant (MonoCompile *cfg, MonoClass *arg_class, MonoType } #endif +static MonoInst* +emit_dot (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, 
MonoTypeEnum arg0_type, MonoInst **args) { + if (!is_element_type_primitive (fsig->params [0])) + return NULL; +#if defined(TARGET_WASM) + if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8)) + return NULL; +#elif defined(TARGET_ARM64) + if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8 || arg0_type == MONO_TYPE_I || arg0_type == MONO_TYPE_U)) + return NULL; +#endif + +#if defined(TARGET_ARM64) || defined(TARGET_WASM) + int instc0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL; + MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc0, arg0_type, fsig, args); + return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); +#elif defined(TARGET_AMD64) + int instc =-1; + if (type_enum_is_float (arg0_type)) { + if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) { + int mask_val = -1; + switch (arg0_type) { + case MONO_TYPE_R4: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM; + mask_val = 0xf1; // 0xf1 ... 0b11110001 + break; + case MONO_TYPE_R8: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM; + mask_val = 0x31; // 0x31 ... 
0b00110001 + break; + default: + return NULL; + } + + MonoInst *dot; + if (COMPILE_LLVM (cfg)) { + int mask_reg = alloc_ireg (cfg); + MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); + + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->sreg3 = mask_reg; + } else { + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->inst_c0 = mask_val; + } + return extract_first_element (cfg, klass, arg0_type, dot->dreg); + } else { + instc = OP_FMUL; + } + } else { + if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1) + return NULL; // We don't support sum vector for byte, sbyte types yet + + // FIXME: + if (!COMPILE_LLVM (cfg)) + return NULL; + + instc = OP_IMUL; + } + MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, arg0_type, fsig, args); + + return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); +#else + return NULL; +#endif +} + /* * Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512. * If the intrinsic is not supported for some reasons, return NULL, and fall back to the c# @@ -1845,70 +1912,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } case SN_Dot: { - if (!is_element_type_primitive (fsig->params [0])) - return NULL; -#if defined(TARGET_WASM) - if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8)) - return NULL; -#elif defined(TARGET_ARM64) - if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8 || arg0_type == MONO_TYPE_I || arg0_type == MONO_TYPE_U)) - return NULL; -#endif - -#if defined(TARGET_ARM64) || defined(TARGET_WASM) - int instc0 = type_enum_is_float (arg0_type) ? 
OP_FMUL : OP_IMUL; - MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc0, arg0_type, fsig, args); - return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); -#elif defined(TARGET_AMD64) - int instc =-1; - if (type_enum_is_float (arg0_type)) { - if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) { - int mask_val = -1; - switch (arg0_type) { - case MONO_TYPE_R4: - instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM; - mask_val = 0xf1; // 0xf1 ... 0b11110001 - break; - case MONO_TYPE_R8: - instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM; - mask_val = 0x31; // 0x31 ... 0b00110001 - break; - default: - return NULL; - } - - MonoInst *dot; - if (COMPILE_LLVM (cfg)) { - int mask_reg = alloc_ireg (cfg); - MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); - - dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); - dot->sreg3 = mask_reg; - } else { - dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); - dot->inst_c0 = mask_val; - } - - return extract_first_element (cfg, klass, arg0_type, dot->dreg); - } else { - instc = OP_FMUL; - } - } else { - if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1) - return NULL; // We don't support sum vector for byte, sbyte types yet - - // FIXME: - if (!COMPILE_LLVM (cfg)) - return NULL; - - instc = OP_IMUL; - } - MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, arg0_type, fsig, args); - - return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); -#else - return NULL; -#endif + return emit_dot (cfg, klass, fsig, arg0_type, args); } case SN_Equals: case SN_EqualsAll: @@ -3068,28 +3072,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f return emit_simd_ins_for_binary_op (cfg, klass, fsig, args, MONO_TYPE_R4, id); } case SN_Dot: { -#if defined(TARGET_ARM64) || defined(TARGET_WASM) - MonoInst *pairwise_multiply = 
emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, MONO_TYPE_R4, fsig, args); - return emit_sum_vector (cfg, fsig->params [0], MONO_TYPE_R4, pairwise_multiply); -#elif defined(TARGET_AMD64) - if (!(mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41)) - return NULL; - - int mask_reg = alloc_ireg (cfg); - MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0xf1); - MonoInst *dot = emit_simd_ins (cfg, klass, OP_SSE41_DPPS, args [0]->dreg, args [1]->dreg); - dot->sreg3 = mask_reg; - - MONO_INST_NEW (cfg, ins, OP_EXTRACT_R4); - ins->dreg = alloc_freg (cfg); - ins->sreg1 = dot->dreg; - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_R4; - MONO_ADD_INS (cfg->cbb, ins); - return ins; -#else - return NULL; -#endif + return emit_dot (cfg, klass, fsig, MONO_TYPE_R4, args); } case SN_Negate: case SN_op_UnaryNegation: { From 646724ea6ea3022408c2a9b9bdf6539469464329 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Tue, 12 Mar 2024 16:17:43 +0100 Subject: [PATCH 09/15] Refactored code to use exposed dot function --- src/mono/mono/mini/simd-intrinsics.c | 177 +++++++++++++-------------- 1 file changed, 82 insertions(+), 95 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 25cf7cb0a9c8f..54386851bf12e 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -689,27 +689,6 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t return ins; } } - -static MonoInst* -emit_sum_sqrt_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg) { - MonoInst *sum = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, arg->dreg, -1); - sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; - sum->inst_c1 = MONO_TYPE_R4; - - if (COMPILE_LLVM (cfg)) { - sum = emit_simd_ins (cfg, klass, OP_EXPAND_R4, sum->dreg, -1); - sum->inst_c1 = MONO_TYPE_R4; - } - - MonoInst* sum_sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); - sum_sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; - 
sum_sqrt->inst_c1 = MONO_TYPE_R4; - - MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sum_sqrt->dreg, -1); - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_R4; - return ins; -} #endif #ifdef TARGET_WASM static MonoInst* emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_type, MonoInst *arg); @@ -1109,46 +1088,6 @@ emit_vector_insert_element ( return ins; } -#if defined(TARGET_ARM64) -static MonoInst* -emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoInst *arg){ - MonoInst *vec_squared = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, arg->dreg); - vec_squared->inst_c0 = OP_FMUL; - vec_squared->inst_c1 = MONO_TYPE_R4; - - const char *class_name = m_class_get_name (klass); - if (!strcmp ("Plane", class_name)) { - static float r4_0 = 0; - MonoInst *zero; - int zero_dreg = alloc_freg (cfg); - MONO_INST_NEW (cfg, zero, OP_R4CONST); - zero->inst_p0 = (void*)&r4_0; - zero->dreg = zero_dreg; - MONO_ADD_INS (cfg->cbb, zero); - vec_squared = emit_vector_insert_element (cfg, klass, vec_squared, MONO_TYPE_R4, zero, 3, FALSE); - } - - MonoInst *sum = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, vec_squared->dreg, -1); - sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; - sum->inst_c1 = MONO_TYPE_R4; - - if (COMPILE_LLVM (cfg)) { - sum = emit_simd_ins (cfg, klass, OP_EXPAND_R4, sum->dreg, -1); - sum->inst_c1 = MONO_TYPE_R4; - } - - MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, sum->dreg, -1); - sqrt_vec->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; - sqrt_vec->inst_c1 = MONO_TYPE_R4; - - MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, sqrt_vec->dreg); - normalized_vec->inst_c0 = OP_FDIV; - normalized_vec->inst_c1 = MONO_TYPE_R4; - - return normalized_vec; -} -#endif - static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, @@ -1471,8 +1410,8 @@ emit_msb_shift_vector_constant (MonoCompile *cfg, MonoClass *arg_class, MonoType 
#endif static MonoInst* -emit_dot (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, MonoTypeEnum arg0_type, MonoInst **args) { - if (!is_element_type_primitive (fsig->params [0])) +emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnum arg0_type, int sreg1, int sreg2) { + if (!is_element_type_primitive (vector_type)) return NULL; #if defined(TARGET_WASM) if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8)) @@ -1484,8 +1423,10 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, MonoTyp #if defined(TARGET_ARM64) || defined(TARGET_WASM) int instc0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL; - MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc0, arg0_type, fsig, args); - return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); + MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2); + pairwise_multiply->inst_c0 = instc0; + pairwise_multiply->inst_c1 = arg0_type; + return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply); #elif defined(TARGET_AMD64) int instc =-1; if (type_enum_is_float (arg0_type)) { @@ -1509,10 +1450,10 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, MonoTyp int mask_reg = alloc_ireg (cfg); MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); - dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot = emit_simd_ins (cfg, klass, instc, sreg1, sreg2); dot->sreg3 = mask_reg; } else { - dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot = emit_simd_ins (cfg, klass, instc, sreg1, sreg2); dot->inst_c0 = mask_val; } return extract_first_element (cfg, klass, arg0_type, dot->dreg); @@ -1529,14 +1470,48 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoMethodSignature *fsig, MonoTyp instc = OP_IMUL; } - MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, 
arg0_type, fsig, args); + MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2); + pairwise_multiply->inst_c0 = instc0; + pairwise_multiply->inst_c1 = arg0_type; - return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply); + return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply); #else return NULL; #endif } +#if defined(TARGET_ARM64) +static MonoInst* +emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoInst *arg){ + MonoInst* vec = arg; + const char *class_name = m_class_get_name (klass); + if (!strcmp ("Plane", class_name)) { + static float r4_0 = 0; + MonoInst *zero; + int zero_dreg = alloc_freg (cfg); + MONO_INST_NEW (cfg, zero, OP_R4CONST); + zero->inst_p0 = (void*)&r4_0; + zero->dreg = zero_dreg; + MONO_ADD_INS (cfg->cbb, zero); + vec = emit_vector_insert_element (cfg, klass, vec, MONO_TYPE_R4, zero, 3, FALSE); + } + + MonoInst *dot = emit_dot(cfg, klass, vector_type, MONO_TYPE_R4, vec->dreg, vec->dreg); + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c1 = MONO_TYPE_R4; + + MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1); + sqrt_vec->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; + sqrt_vec->inst_c1 = MONO_TYPE_R4; + + MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, sqrt_vec->dreg); + normalized_vec->inst_c0 = OP_FDIV; + normalized_vec->inst_c1 = MONO_TYPE_R4; + + return normalized_vec; +} +#endif + /* * Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512. 
* If the intrinsic is not supported for some reasons, return NULL, and fall back to the c# @@ -1912,7 +1887,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } } case SN_Dot: { - return emit_dot (cfg, klass, fsig, arg0_type, args); + return emit_dot (cfg, klass, fsig->params [0], arg0_type, args [0]->dreg, args [1]->dreg); } case SN_Equals: case SN_EqualsAll: @@ -3072,7 +3047,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f return emit_simd_ins_for_binary_op (cfg, klass, fsig, args, MONO_TYPE_R4, id); } case SN_Dot: { - return emit_dot (cfg, klass, fsig, MONO_TYPE_R4, args); + return emit_dot (cfg, klass, fsig->params [0], MONO_TYPE_R4, args [0]->dreg, args [1]->dreg); } case SN_Negate: case SN_op_UnaryNegation: { @@ -3179,15 +3154,24 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f case SN_DistanceSquared: { #if defined(TARGET_ARM64) MonoInst *diffs = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, MONO_TYPE_R4, fsig, args); - MonoInst *diffs_squared = emit_simd_ins (cfg, klass, OP_XBINOP, diffs->dreg, diffs->dreg); - diffs_squared->inst_c0 = OP_FMUL; - diffs_squared->inst_c1 = MONO_TYPE_R4; + MonoInst *dot = emit_dot(cfg, klass, fsig->params [0], MONO_TYPE_R4, diffs->dreg, diffs->dreg); switch (id) { - case SN_Distance: - return emit_sum_sqrt_vector_2_3_4 (cfg, klass, diffs_squared); + case SN_Distance: { + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c1 = MONO_TYPE_R4; + + MonoInst *sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1); + sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; + sqrt->inst_c1 = MONO_TYPE_R4; + + MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R4; + return ins; + } case SN_DistanceSquared: - return emit_sum_vector (cfg, fsig->params [0], MONO_TYPE_R4, diffs_squared); + return dot; default: g_assert_not_reached 
(); } @@ -3198,16 +3182,24 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f case SN_LengthSquared: { #if defined (TARGET_ARM64) int src1 = load_simd_vreg (cfg, cmethod, args [0], NULL); - - MonoInst *vec_squared = emit_simd_ins (cfg, klass, OP_XBINOP, src1, src1); - vec_squared->inst_c0 = OP_FMUL; - vec_squared->inst_c1 = MONO_TYPE_R4; + MonoInst *dot = emit_dot(cfg, klass, type, MONO_TYPE_R4, src1, src1); switch (id) { - case SN_Length: - return emit_sum_sqrt_vector_2_3_4 (cfg, klass, vec_squared); + case SN_Length: { + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c1 = MONO_TYPE_R4; + + MonoInst *sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1); + sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; + sqrt->inst_c1 = MONO_TYPE_R4; + + MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R4; + return ins; + } case SN_LengthSquared: - return emit_sum_vector (cfg, type, MONO_TYPE_R4, vec_squared); + return dot; default: g_assert_not_reached (); } @@ -3218,16 +3210,11 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f #if defined (TARGET_ARM64) MonoInst* v1 = args [1]; if (!strcmp ("Quaternion", m_class_get_name (klass))) { - MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, MONO_TYPE_R4, fsig, args); - pairwise_multiply->sreg3 = -1; - MonoInst *dot = emit_simd_ins (cfg, klass, OP_ARM64_XADDV, pairwise_multiply->dreg, -1); - dot->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; - dot->inst_c1 = MONO_TYPE_R4; + MonoInst *dot = emit_dot(cfg, klass, fsig->params [0], MONO_TYPE_R4, args [0]->dreg, args [1]->dreg); - if (COMPILE_LLVM (cfg)) { - dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); - dot->inst_c1 = MONO_TYPE_R4; - } + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c0 = 0; + dot->inst_c1 = MONO_TYPE_R4; MonoInst* zeros 
= emit_xzero (cfg, klass); @@ -3254,7 +3241,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f result->inst_c1 = MONO_TYPE_R4; if (!strcmp ("Quaternion", m_class_get_name (klass))) { - return emit_normalize_vector_2_3_4 (cfg, klass, result); + return emit_normalize_vector_2_3_4 (cfg, klass, type, result); } return result; @@ -3263,7 +3250,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f break; case SN_Normalize: { #if defined (TARGET_ARM64) - return emit_normalize_vector_2_3_4 (cfg, klass, args[0]); + return emit_normalize_vector_2_3_4 (cfg, klass, type, args[0]); #endif } break; From 2b8a85c45e19b9c59243409d0be56d6ff7ed3900 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Tue, 12 Mar 2024 16:36:49 +0100 Subject: [PATCH 10/15] Fixed x64 build error --- src/mono/mono/mini/simd-intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 54386851bf12e..615cf17e429e7 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1421,8 +1421,8 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu return NULL; #endif -#if defined(TARGET_ARM64) || defined(TARGET_WASM) int instc0 = type_enum_is_float (arg0_type) ? 
OP_FMUL : OP_IMUL; +#if defined(TARGET_ARM64) || defined(TARGET_WASM) MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2); pairwise_multiply->inst_c0 = instc0; pairwise_multiply->inst_c1 = arg0_type; From 4a005d24760e64d624414d35d633385f1ff96cd2 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Tue, 12 Mar 2024 17:02:14 +0100 Subject: [PATCH 11/15] Fixed more build errors --- src/mono/mono/mini/simd-intrinsics.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 615cf17e429e7..4cb5578a6bdff 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1421,10 +1421,9 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu return NULL; #endif - int instc0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL; #if defined(TARGET_ARM64) || defined(TARGET_WASM) MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2); - pairwise_multiply->inst_c0 = instc0; + pairwise_multiply->inst_c0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL; pairwise_multiply->inst_c1 = arg0_type; return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply); #elif defined(TARGET_AMD64) @@ -1471,7 +1470,7 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu instc = OP_IMUL; } MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2); - pairwise_multiply->inst_c0 = instc0; + pairwise_multiply->inst_c0 = type_enum_is_float (arg0_type) ? 
OP_FMUL : OP_IMUL; pairwise_multiply->inst_c1 = arg0_type; return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply); @@ -3165,10 +3164,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; sqrt->inst_c1 = MONO_TYPE_R4; - MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_R4; - return ins; + MonoInst *distance = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); + distance->inst_c0 = 0; + distance->inst_c1 = MONO_TYPE_R4; + return distance; } case SN_DistanceSquared: return dot; @@ -3193,10 +3192,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; sqrt->inst_c1 = MONO_TYPE_R4; - MonoInst *ins = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); - ins->inst_c0 = 0; - ins->inst_c1 = MONO_TYPE_R4; - return ins; + MonoInst *length = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1); + length->inst_c0 = 0; + length->inst_c1 = MONO_TYPE_R4; + return length; } case SN_LengthSquared: return dot; From f8f1557cd2258e234e8a81e12f266209f2e3d79e Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 13 Mar 2024 11:47:54 +0100 Subject: [PATCH 12/15] Removed intrinsics for methods not intrinsified on coreclr side --- src/mono/mono/mini/simd-intrinsics.c | 66 +++------------------------- 1 file changed, 5 insertions(+), 61 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 4cb5578a6bdff..e99da692d1280 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -3097,43 +3097,8 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f return NULL; #endif } - case SN_CopyTo: { -#if defined(TARGET_ARM64) - if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == 
MONO_TYPE_SZARRAY)) { - MonoInst *index_ins; - int val_vreg, end_index_reg; - val_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL); - - if (fsig->param_count == 2) { - index_ins = args [2]; - } else { - EMIT_NEW_ICONST (cfg, index_ins, 0); - } - - MonoInst *ldelema_ins; - MonoInst *array_ins = args [1]; - - /* CopyTo () does complicated argument checks */ - mini_emit_bounds_check_offset (cfg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length), index_ins->dreg, "ArgumentOutOfRangeException", FALSE); - end_index_reg = alloc_ireg (cfg); - int len_reg = alloc_ireg (cfg); - MONO_EMIT_NEW_LOAD_MEMBASE_OP_FLAGS (cfg, OP_LOADI4_MEMBASE, len_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length), MONO_INST_INVARIANT_LOAD); - EMIT_NEW_BIALU (cfg, ins, OP_ISUB, end_index_reg, len_reg, index_ins->dreg); - MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, end_index_reg, len); - MONO_EMIT_NEW_COND_EXC (cfg, LT, "ArgumentException"); - - /* Load the array slice into the simd reg */ - ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type_internal (etype), array_ins, index_ins, FALSE, FALSE); - EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, val_vreg); - ins->klass = cmethod->klass; - return ins; - } else { - // CopyTo(Span) - // Not intrinsified on coreclr - return NULL; - } -#endif - } + case SN_CopyTo: + return NULL; break; case SN_Clamp: { if (!(!fsig->hasthis && fsig->param_count == 3 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type) && mono_metadata_type_equal (fsig->params [2], type))) @@ -3208,26 +3173,9 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f case SN_Lerp: { #if defined (TARGET_ARM64) MonoInst* v1 = args [1]; - if (!strcmp ("Quaternion", m_class_get_name (klass))) { - MonoInst *dot = emit_dot(cfg, klass, fsig->params [0], MONO_TYPE_R4, args [0]->dreg, args 
[1]->dreg); - - dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); - dot->inst_c0 = 0; - dot->inst_c1 = MONO_TYPE_R4; - - MonoInst* zeros = emit_xzero (cfg, klass); - - MonoInst* ge_0 = emit_simd_ins (cfg, klass, OP_XCOMPARE_FP, dot->dreg, zeros->dreg); - ge_0->inst_c0 = CMP_GE; - ge_0->inst_c1 = MONO_TYPE_R4; - - MonoInst* negated_v1 = emit_simd_ins (cfg, klass, OP_NEGATION, args [1]->dreg, -1); - negated_v1->inst_c1 = MONO_TYPE_R4; - - v1 = emit_simd_ins (cfg, klass, OP_BSL, ge_0->dreg, args [1]->dreg); - v1->sreg3 = negated_v1->dreg; - v1->inst_c1 = MONO_TYPE_R4; - } + if (!strcmp ("Quaternion", m_class_get_name (klass))) + return NULL; + MonoInst *diffs = emit_simd_ins (cfg, klass, OP_XBINOP, v1->dreg, args [0]->dreg); diffs->inst_c0 = OP_FSUB; @@ -3239,10 +3187,6 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f result->inst_c0 = OP_FADD; result->inst_c1 = MONO_TYPE_R4; - if (!strcmp ("Quaternion", m_class_get_name (klass))) { - return emit_normalize_vector_2_3_4 (cfg, klass, type, result); - } - return result; #endif } From 5bbe5c4c2fd3cdcdef5b92db2d216a8f42c22b50 Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Wed, 13 Mar 2024 15:52:50 +0100 Subject: [PATCH 13/15] Cleaned up code --- src/mono/mono/mini/simd-intrinsics.c | 59 ++++++++++++---------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index e99da692d1280..c0b62f2939778 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1479,38 +1479,6 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu #endif } -#if defined(TARGET_ARM64) -static MonoInst* -emit_normalize_vector_2_3_4 (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoInst *arg){ - MonoInst* vec = arg; - const char *class_name = m_class_get_name (klass); - if (!strcmp ("Plane", class_name)) { - static float r4_0 = 0; 
- MonoInst *zero; - int zero_dreg = alloc_freg (cfg); - MONO_INST_NEW (cfg, zero, OP_R4CONST); - zero->inst_p0 = (void*)&r4_0; - zero->dreg = zero_dreg; - MONO_ADD_INS (cfg->cbb, zero); - vec = emit_vector_insert_element (cfg, klass, vec, MONO_TYPE_R4, zero, 3, FALSE); - } - - MonoInst *dot = emit_dot(cfg, klass, vector_type, MONO_TYPE_R4, vec->dreg, vec->dreg); - dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); - dot->inst_c1 = MONO_TYPE_R4; - - MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1); - sqrt_vec->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT; - sqrt_vec->inst_c1 = MONO_TYPE_R4; - - MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, arg->dreg, sqrt_vec->dreg); - normalized_vec->inst_c0 = OP_FDIV; - normalized_vec->inst_c1 = MONO_TYPE_R4; - - return normalized_vec; -} -#endif - /* * Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512. * If the intrinsic is not supported for some reasons, return NULL, and fall back to the c# @@ -3193,7 +3161,32 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f break; case SN_Normalize: { #if defined (TARGET_ARM64) - return emit_normalize_vector_2_3_4 (cfg, klass, type, args[0]); + MonoInst* vec = args[0]; + const char *class_name = m_class_get_name (klass); + if (!strcmp ("Plane", class_name)) { + static float r4_0 = 0; + MonoInst *zero; + int zero_dreg = alloc_freg (cfg); + MONO_INST_NEW (cfg, zero, OP_R4CONST); + zero->inst_p0 = (void*)&r4_0; + zero->dreg = zero_dreg; + MONO_ADD_INS (cfg->cbb, zero); + vec = emit_vector_insert_element (cfg, klass, vec, MONO_TYPE_R4, zero, 3, FALSE); + } + + MonoInst *dot = emit_dot(cfg, klass, type, MONO_TYPE_R4, vec->dreg, vec->dreg); + dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1); + dot->inst_c1 = MONO_TYPE_R4; + + MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1); + sqrt_vec->inst_c0 = 
INTRINS_AARCH64_ADV_SIMD_FSQRT; + sqrt_vec->inst_c1 = MONO_TYPE_R4; + + MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, sqrt_vec->dreg); + normalized_vec->inst_c0 = OP_FDIV; + normalized_vec->inst_c1 = MONO_TYPE_R4; + + return normalized_vec; #endif } break; From 1ebb378fb07651a740ec5a3b8674b71dc499fdfb Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Thu, 14 Mar 2024 09:11:15 +0100 Subject: [PATCH 14/15] Replaced break with return null --- src/mono/mono/mini/simd-intrinsics.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index c0b62f2939778..1d6a56676b0f0 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -3067,7 +3067,6 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f } case SN_CopyTo: return NULL; - break; case SN_Clamp: { if (!(!fsig->hasthis && fsig->param_count == 3 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type) && mono_metadata_type_equal (fsig->params [2], type))) return NULL; @@ -3107,9 +3106,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f default: g_assert_not_reached (); } +#else + return NULL; #endif } - break; case SN_Length: case SN_LengthSquared: { #if defined (TARGET_ARM64) @@ -3135,9 +3135,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f default: g_assert_not_reached (); } +#else + return NULL; #endif } - break; case SN_Lerp: { #if defined (TARGET_ARM64) MonoInst* v1 = args [1]; @@ -3156,9 +3157,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f result->inst_c1 = MONO_TYPE_R4; return result; +#else + return NULL; #endif } - break; case SN_Normalize: { #if defined (TARGET_ARM64) MonoInst* vec = args[0]; @@ 
-3187,9 +3189,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f normalized_vec->inst_c1 = MONO_TYPE_R4; return normalized_vec; +#else + return NULL; #endif } - break; case SN_Conjugate: { #if defined (TARGET_ARM64) float value[4]; @@ -3202,9 +3205,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f result->inst_c0 = OP_FMUL; result->inst_c1 = MONO_TYPE_R4; return result; +#else + return NULL; #endif } - break; default: g_assert_not_reached (); } From a134baeb376d85484cf2b0004181585f00c2585d Mon Sep 17 00:00:00 2001 From: Jeremi Kurdek Date: Fri, 15 Mar 2024 10:37:15 +0100 Subject: [PATCH 15/15] Removed trailing whitespaces --- src/mono/mono/mini/simd-intrinsics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index a8bbd82482b3a..e003d1247892c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1424,7 +1424,7 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu break; default: return NULL; - } + } MonoInst *dot; if (COMPILE_LLVM (cfg)) { @@ -1440,7 +1440,7 @@ emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnu return extract_first_element (cfg, klass, arg0_type, dot->dreg); } else { instc = OP_FMUL; - } + } } else { if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1) return NULL; // We don't support sum vector for byte, sbyte types yet