Skip to content

Commit 516f5c4

Browse files
authored
[Mono] [Arm64] Added SIMD support for vector 2/3/4 methods (#98761)
* [Mono] [Arm64] Added multiple vector intrinsics * Added LLVM support * fix build errors on x64 * Added Quaternion.Conjugate * Changed frsqrts codegen * fixed whitespace changes * Removed reciprocal sqrt estimation from normalize * Extracted dot method into separate function * Refactored code to use exposed dot function * Fixed x64 build error * Fixed more build errors * Removed intrinsics for methods not intrinsified on coreclr side * Cleaned up code * Replaced break with return null * Removed trailing whitespaces
1 parent 334020d commit 516f5c4

File tree

1 file changed

+199
-93
lines changed

1 file changed

+199
-93
lines changed

src/mono/mono/mini/simd-intrinsics.c

+199-93
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,7 @@ emit_xconst_v128 (MonoCompile *cfg, MonoClass *klass, guint8 value[16])
632632
ins->type = STACK_VTYPE;
633633
ins->dreg = alloc_xreg (cfg);
634634
ins->inst_p0 = mono_mem_manager_alloc (cfg->mem_manager, size);
635+
ins->klass = klass;
635636
MONO_ADD_INS (cfg->cbb, ins);
636637

637638
memcpy (ins->inst_p0, &value[0], size);
@@ -1390,6 +1391,76 @@ emit_msb_shift_vector_constant (MonoCompile *cfg, MonoClass *arg_class, MonoType
13901391
}
13911392
#endif
13921393

1394+
/*
 * emit_dot:
 *
 *   Emit IR for the dot product of the two SIMD values held in SREG1 and SREG2
 * (element type ARG0_TYPE, vector type VECTOR_TYPE, result class KLASS).
 *
 * Returns the instruction producing the result, or NULL when the case is not
 * handled here:
 *   - the element type of VECTOR_TYPE is not primitive;
 *   - WASM without LLVM and ARG0_TYPE is I8/U8;
 *   - ARM64 without LLVM and ARG0_TYPE is I8/U8/I/U;
 *   - AMD64 with ARG0_TYPE I1/U1, or any integer type without LLVM;
 *   - AMD64 with SSE4.1 and a floating point ARG0_TYPE other than R4/R8;
 *   - any target other than ARM64, WASM and AMD64.
 *
 * On ARM64/WASM the operands are multiplied with OP_XBINOP (OP_FMUL or OP_IMUL)
 * and the product is passed to emit_sum_vector (). On AMD64 floating point
 * inputs use DPPS/DPPD when SSE4.1 is available and the first element of the
 * result is extracted; every other supported AMD64 case uses the same
 * multiply + emit_sum_vector () sequence as ARM64/WASM.
 */
static MonoInst*
1395+
emit_dot (MonoCompile *cfg, MonoClass *klass, MonoType *vector_type, MonoTypeEnum arg0_type, int sreg1, int sreg2) {
1396+
if (!is_element_type_primitive (vector_type))
1397+
return NULL;
1398+
#if defined(TARGET_WASM)
1399+
if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8))
1400+
return NULL;
1401+
#elif defined(TARGET_ARM64)
1402+
if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8 || arg0_type == MONO_TYPE_I || arg0_type == MONO_TYPE_U))
1403+
return NULL;
1404+
#endif
1405+
1406+
#if defined(TARGET_ARM64) || defined(TARGET_WASM)
1407+
MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2);
1408+
pairwise_multiply->inst_c0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL;
1409+
pairwise_multiply->inst_c1 = arg0_type;
1410+
return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply);
1411+
#elif defined(TARGET_AMD64)
1412+
int instc =-1;
1413+
if (type_enum_is_float (arg0_type)) {
1414+
if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) {
1415+
int mask_val = -1;
1416+
switch (arg0_type) {
1417+
case MONO_TYPE_R4:
1418+
instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM;
1419+
mask_val = 0xf1; // 0xf1 ... 0b11110001
1420+
break;
1421+
case MONO_TYPE_R8:
1422+
instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM;
1423+
mask_val = 0x31; // 0x31 ... 0b00110001
1424+
break;
1425+
default:
1426+
return NULL;
1427+
}
1428+
1429+
MonoInst *dot;
1430+
if (COMPILE_LLVM (cfg)) {
1431+
int mask_reg = alloc_ireg (cfg);
1432+
MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); // LLVM path: the mask is materialized in an integer vreg and passed as sreg3
1433+
1434+
dot = emit_simd_ins (cfg, klass, instc, sreg1, sreg2);
1435+
dot->sreg3 = mask_reg;
1436+
} else {
1437+
dot = emit_simd_ins (cfg, klass, instc, sreg1, sreg2);
1438+
dot->inst_c0 = mask_val; // non-LLVM path: the *_IMM opcode carries the mask in inst_c0
1439+
}
1440+
return extract_first_element (cfg, klass, arg0_type, dot->dreg);
1441+
} else {
1442+
instc = OP_FMUL; // NOTE(review): instc is not read after this assignment; the opcode is recomputed below
1443+
}
1444+
} else {
1445+
if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)
1446+
return NULL; // We don't support sum vector for byte, sbyte types yet
1447+
1448+
// FIXME:
1449+
if (!COMPILE_LLVM (cfg))
1450+
return NULL;
1451+
1452+
instc = OP_IMUL; // NOTE(review): instc is not read after this assignment; the opcode is recomputed below
1453+
}
1454+
MonoInst *pairwise_multiply = emit_simd_ins (cfg, klass, OP_XBINOP, sreg1, sreg2);
1455+
pairwise_multiply->inst_c0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL;
1456+
pairwise_multiply->inst_c1 = arg0_type;
1457+
1458+
return emit_sum_vector (cfg, vector_type, arg0_type, pairwise_multiply);
1459+
#else
1460+
return NULL;
1461+
#endif
1462+
}
1463+
13931464
/*
13941465
* Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512.
13951466
* If the intrinsic is not supported for some reasons, return NULL, and fall back to the c#
@@ -1768,70 +1839,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
17681839
}
17691840
}
17701841
case SN_Dot: {
1771-
if (!is_element_type_primitive (fsig->params [0]))
1772-
return NULL;
1773-
#if defined(TARGET_WASM)
1774-
if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8))
1775-
return NULL;
1776-
#elif defined(TARGET_ARM64)
1777-
if (!COMPILE_LLVM (cfg) && (arg0_type == MONO_TYPE_I8 || arg0_type == MONO_TYPE_U8 || arg0_type == MONO_TYPE_I || arg0_type == MONO_TYPE_U))
1778-
return NULL;
1779-
#endif
1780-
1781-
#if defined(TARGET_ARM64) || defined(TARGET_WASM)
1782-
int instc0 = type_enum_is_float (arg0_type) ? OP_FMUL : OP_IMUL;
1783-
MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc0, arg0_type, fsig, args);
1784-
return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply);
1785-
#elif defined(TARGET_AMD64)
1786-
int instc =-1;
1787-
if (type_enum_is_float (arg0_type)) {
1788-
if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) {
1789-
int mask_val = -1;
1790-
switch (arg0_type) {
1791-
case MONO_TYPE_R4:
1792-
instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM;
1793-
mask_val = 0xf1; // 0xf1 ... 0b11110001
1794-
break;
1795-
case MONO_TYPE_R8:
1796-
instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM;
1797-
mask_val = 0x31; // 0x31 ... 0b00110001
1798-
break;
1799-
default:
1800-
return NULL;
1801-
}
1802-
1803-
MonoInst *dot;
1804-
if (COMPILE_LLVM (cfg)) {
1805-
int mask_reg = alloc_ireg (cfg);
1806-
MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val);
1807-
1808-
dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg);
1809-
dot->sreg3 = mask_reg;
1810-
} else {
1811-
dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg);
1812-
dot->inst_c0 = mask_val;
1813-
}
1814-
1815-
return extract_first_element (cfg, klass, arg0_type, dot->dreg);
1816-
} else {
1817-
instc = OP_FMUL;
1818-
}
1819-
} else {
1820-
if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)
1821-
return NULL; // We don't support sum vector for byte, sbyte types yet
1822-
1823-
// FIXME:
1824-
if (!COMPILE_LLVM (cfg))
1825-
return NULL;
1826-
1827-
instc = OP_IMUL;
1828-
}
1829-
MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, arg0_type, fsig, args);
1830-
1831-
return emit_sum_vector (cfg, fsig->params [0], arg0_type, pairwise_multiply);
1832-
#else
1833-
return NULL;
1834-
#endif
1842+
return emit_dot (cfg, klass, fsig->params [0], arg0_type, args [0]->dreg, args [1]->dreg);
18351843
}
18361844
case SN_Equals:
18371845
case SN_EqualsAll:
@@ -2910,6 +2918,8 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
29102918
value [1] = 1.0f;
29112919
value [2] = 1.0f;
29122920
value [3] = 1.0f;
2921+
if (len == 3)
2922+
value [3] = 0.0f;
29132923
return emit_xconst_v128 (cfg, klass, (guint8*)value);
29142924
}
29152925
case SN_set_Item: {
@@ -2988,28 +2998,7 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
29882998
return emit_simd_ins_for_binary_op (cfg, klass, fsig, args, MONO_TYPE_R4, id);
29892999
}
29903000
case SN_Dot: {
2991-
#if defined(TARGET_ARM64) || defined(TARGET_WASM)
2992-
MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, MONO_TYPE_R4, fsig, args);
2993-
return emit_sum_vector (cfg, fsig->params [0], MONO_TYPE_R4, pairwise_multiply);
2994-
#elif defined(TARGET_AMD64)
2995-
if (!(mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41))
2996-
return NULL;
2997-
2998-
int mask_reg = alloc_ireg (cfg);
2999-
MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0xf1);
3000-
MonoInst *dot = emit_simd_ins (cfg, klass, OP_SSE41_DPPS, args [0]->dreg, args [1]->dreg);
3001-
dot->sreg3 = mask_reg;
3002-
3003-
MONO_INST_NEW (cfg, ins, OP_EXTRACT_R4);
3004-
ins->dreg = alloc_freg (cfg);
3005-
ins->sreg1 = dot->dreg;
3006-
ins->inst_c0 = 0;
3007-
ins->inst_c1 = MONO_TYPE_R4;
3008-
MONO_ADD_INS (cfg->cbb, ins);
3009-
return ins;
3010-
#else
3011-
return NULL;
3012-
#endif
3001+
return emit_dot (cfg, klass, fsig->params [0], MONO_TYPE_R4, args [0]->dreg, args [1]->dreg);
30133002
}
30143003
case SN_Negate:
30153004
case SN_op_UnaryNegation: {
@@ -3061,7 +3050,6 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
30613050
#endif
30623051
}
30633052
case SN_CopyTo:
3064-
// FIXME: https://github.com/dotnet/runtime/issues/91394
30653053
return NULL;
30663054
case SN_Clamp: {
30673055
if (!(!fsig->hasthis && fsig->param_count == 3 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type) && mono_metadata_type_equal (fsig->params [2], type)))
@@ -3077,15 +3065,133 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
30773065

30783066
return min;
30793067
}
3080-
case SN_Conjugate:
3081-
case SN_Distance:
3082-
case SN_DistanceSquared:
3068+
case SN_Distance:
3069+
case SN_DistanceSquared: {
3070+
#if defined(TARGET_ARM64)
3071+
MonoInst *diffs = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, MONO_TYPE_R4, fsig, args);
3072+
MonoInst *dot = emit_dot(cfg, klass, fsig->params [0], MONO_TYPE_R4, diffs->dreg, diffs->dreg);
3073+
3074+
switch (id) {
3075+
case SN_Distance: {
3076+
dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1);
3077+
dot->inst_c1 = MONO_TYPE_R4;
3078+
3079+
MonoInst *sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1);
3080+
sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT;
3081+
sqrt->inst_c1 = MONO_TYPE_R4;
3082+
3083+
MonoInst *distance = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1);
3084+
distance->inst_c0 = 0;
3085+
distance->inst_c1 = MONO_TYPE_R4;
3086+
return distance;
3087+
}
3088+
case SN_DistanceSquared:
3089+
return dot;
3090+
default:
3091+
g_assert_not_reached ();
3092+
}
3093+
#else
3094+
return NULL;
3095+
#endif
3096+
}
30833097
case SN_Length:
3084-
case SN_LengthSquared:
3085-
case SN_Lerp:
3098+
case SN_LengthSquared: {
3099+
#if defined (TARGET_ARM64)
3100+
int src1 = load_simd_vreg (cfg, cmethod, args [0], NULL);
3101+
MonoInst *dot = emit_dot(cfg, klass, type, MONO_TYPE_R4, src1, src1);
3102+
3103+
switch (id) {
3104+
case SN_Length: {
3105+
dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1);
3106+
dot->inst_c1 = MONO_TYPE_R4;
3107+
3108+
MonoInst *sqrt = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1);
3109+
sqrt->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT;
3110+
sqrt->inst_c1 = MONO_TYPE_R4;
3111+
3112+
MonoInst *length = emit_simd_ins (cfg, klass, OP_EXTRACT_R4, sqrt->dreg, -1);
3113+
length->inst_c0 = 0;
3114+
length->inst_c1 = MONO_TYPE_R4;
3115+
return length;
3116+
}
3117+
case SN_LengthSquared:
3118+
return dot;
3119+
default:
3120+
g_assert_not_reached ();
3121+
}
3122+
#else
3123+
return NULL;
3124+
#endif
3125+
}
3126+
case SN_Lerp: {
3127+
#if defined (TARGET_ARM64)
3128+
MonoInst* v1 = args [1];
3129+
if (!strcmp ("Quaternion", m_class_get_name (klass)))
3130+
return NULL;
3131+
3132+
3133+
MonoInst *diffs = emit_simd_ins (cfg, klass, OP_XBINOP, v1->dreg, args [0]->dreg);
3134+
diffs->inst_c0 = OP_FSUB;
3135+
diffs->inst_c1 = MONO_TYPE_R4;
3136+
3137+
MonoInst *scaled_diffs = handle_mul_div_by_scalar (cfg, klass, MONO_TYPE_R4, args [2]->dreg, diffs->dreg, OP_FMUL);
3138+
3139+
MonoInst *result = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, scaled_diffs->dreg);
3140+
result->inst_c0 = OP_FADD;
3141+
result->inst_c1 = MONO_TYPE_R4;
3142+
3143+
return result;
3144+
#else
3145+
return NULL;
3146+
#endif
3147+
}
30863148
case SN_Normalize: {
3087-
// FIXME: https://github.com/dotnet/runtime/issues/91394
3149+
#if defined (TARGET_ARM64)
3150+
MonoInst* vec = args[0];
3151+
const char *class_name = m_class_get_name (klass);
3152+
if (!strcmp ("Plane", class_name)) {
3153+
static float r4_0 = 0;
3154+
MonoInst *zero;
3155+
int zero_dreg = alloc_freg (cfg);
3156+
MONO_INST_NEW (cfg, zero, OP_R4CONST);
3157+
zero->inst_p0 = (void*)&r4_0;
3158+
zero->dreg = zero_dreg;
3159+
MONO_ADD_INS (cfg->cbb, zero);
3160+
vec = emit_vector_insert_element (cfg, klass, vec, MONO_TYPE_R4, zero, 3, FALSE);
3161+
}
3162+
3163+
MonoInst *dot = emit_dot(cfg, klass, type, MONO_TYPE_R4, vec->dreg, vec->dreg);
3164+
dot = emit_simd_ins (cfg, klass, OP_EXPAND_R4, dot->dreg, -1);
3165+
dot->inst_c1 = MONO_TYPE_R4;
3166+
3167+
MonoInst *sqrt_vec = emit_simd_ins (cfg, klass, OP_XOP_OVR_X_X, dot->dreg, -1);
3168+
sqrt_vec->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FSQRT;
3169+
sqrt_vec->inst_c1 = MONO_TYPE_R4;
3170+
3171+
MonoInst *normalized_vec = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, sqrt_vec->dreg);
3172+
normalized_vec->inst_c0 = OP_FDIV;
3173+
normalized_vec->inst_c1 = MONO_TYPE_R4;
3174+
3175+
return normalized_vec;
3176+
#else
30883177
return NULL;
3178+
#endif
3179+
}
3180+
case SN_Conjugate: {
3181+
#if defined (TARGET_ARM64)
3182+
float value[4];
3183+
value [0] = -1.0f;
3184+
value [1] = -1.0f;
3185+
value [2] = -1.0f;
3186+
value [3] = 1.0f;
3187+
MonoInst* r = emit_xconst_v128 (cfg, klass, (guint8*)value);
3188+
MonoInst* result = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, r->dreg);
3189+
result->inst_c0 = OP_FMUL;
3190+
result->inst_c1 = MONO_TYPE_R4;
3191+
return result;
3192+
#else
3193+
return NULL;
3194+
#endif
30893195
}
30903196
default:
30913197
g_assert_not_reached ();

0 commit comments

Comments
 (0)