From a974719dc220818801bfcb84d25f6a37986014f3 Mon Sep 17 00:00:00 2001 From: Jan Dupej <109523496+jandupej@users.noreply.github.com> Date: Fri, 8 Sep 2023 10:54:51 +0200 Subject: [PATCH] [mono][jit] Enable all X86Base intrinsics (#91393) * Moving towards DivRem intrinsic. * DivRem for x64. * Typos. * Fixed intrinsic name. * Fix. * x86Base intrinsics in mini and LLVM. * Assert on unconsumed remainder in fused DivRem. * Reenable x86base tests. * Fixed narrowing assignment. * Documented the divrem operation pair. --- src/mono/mono/arch/amd64/amd64-codegen.h | 3 ++ src/mono/mono/arch/x86/x86-codegen.h | 3 ++ src/mono/mono/mini/cpu-amd64.mdesc | 10 ++++++ src/mono/mono/mini/cpu-x86.mdesc | 7 ++++ src/mono/mono/mini/mini-amd64.c | 34 ++++++++++++++++++ src/mono/mono/mini/mini-llvm.c | 41 +++++++++++++++++++++ src/mono/mono/mini/mini-ops.h | 22 ++++++++++++ src/mono/mono/mini/mini-x86.c | 19 ++++++++++ src/mono/mono/mini/simd-intrinsics.c | 46 +++++++++++++++++++++++- src/mono/mono/mini/simd-methods.h | 1 + src/tests/issues.targets | 9 ----- 11 files changed, 185 insertions(+), 10 deletions(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 304ff3cb74377..3d4d3845b3df3 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -1206,6 +1206,9 @@ typedef union { #define amd64_movsb_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsb(inst); amd64_codegen_post(inst); } while (0) #define amd64_movsl_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsl(inst); amd64_codegen_post(inst); } while (0) #define amd64_movsd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsd(inst); amd64_codegen_post(inst); } while (0) +#define amd64_bsf_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_bsf 
((inst),(dreg)&0x7,(reg)&0x7); amd64_codegen_post (inst); } while (0) +#define amd64_bsr_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_bsr ((inst),(dreg)&0x7,(reg)&0x7); amd64_codegen_post (inst); } while (0) + #define amd64_prefix_size(inst,p,size) do { x86_prefix((inst), p); } while (0) #define amd64_rdtsc_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_rdtsc(inst); amd64_codegen_post(inst); } while (0) #define amd64_cmpxchg_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmpxchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0) diff --git a/src/mono/mono/arch/x86/x86-codegen.h b/src/mono/mono/arch/x86/x86-codegen.h index aca2b659ca058..69372e8972479 100644 --- a/src/mono/mono/arch/x86/x86-codegen.h +++ b/src/mono/mono/arch/x86/x86-codegen.h @@ -1963,6 +1963,9 @@ mono_x86_patch_inline (guchar* code, gpointer target) #define x86_leave(inst) do { x86_byte (inst, 0xc9); } while (0) #define x86_sahf(inst) do { x86_byte (inst, 0x9e); } while (0) +#define x86_bsf(inst,dreg,reg) do { x86_byte (inst, 0x0f); x86_byte (inst, 0xbc); x86_reg_emit ((inst), (dreg), (reg)); } while (0) +#define x86_bsr(inst,dreg,reg) do { x86_byte (inst, 0x0f); x86_byte (inst, 0xbd); x86_reg_emit ((inst), (dreg), (reg)); } while (0) + #define x86_fsin(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xfe); } while (0) #define x86_fcos(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xff); } while (0) #define x86_fabs(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xe1); } while (0) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 06321f83169e2..e0973099b59b8 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -98,6 +98,9 @@ 
long_div: dest:a src1:a src2:i len:16 clob:d long_div_un: dest:a src1:a src2:i len:16 clob:d long_rem: dest:d src1:a src2:i len:16 clob:a long_rem_un: dest:d src1:a src2:i len:16 clob:a +long_divrem: dest:a src1:a src2:d src3:i len:16 clob:x +long_divrem_un: dest:a src1:a src2:d src3:i len:16 clob:x +long_divrem2: dest:d len:3 long_and: dest:i src1:i src2:i len:3 clob:1 long_or: dest:i src1:i src2:i len:3 clob:1 long_xor: dest:i src1:i src2:i len:3 clob:1 @@ -337,6 +340,10 @@ amd64_lea_membase: dest:i src1:i len:11 x86_xchg: src1:i src2:i clob:x len:2 x86_fpop: src1:f len:3 x86_seteq_membase: src1:b len:9 +x86_bsf32: dest:i src1:i len:4 +x86_bsf64: dest:l src1:l len:4 +x86_bsr32: dest:i src1:i len:4 +x86_bsr64: dest:l src1:l len:4 x86_add_reg_membase: dest:i src1:i src2:b clob:1 len:13 x86_sub_reg_membase: dest:i src1:i src2:b clob:1 len:13 @@ -411,6 +418,9 @@ int_div: dest:a src1:a src2:i clob:d len:32 int_div_un: dest:a src1:a src2:i clob:d len:32 int_rem: dest:d src1:a src2:i clob:a len:32 int_rem_un: dest:d src1:a src2:i clob:a len:32 +int_divrem: dest:a src1:a src2:d src3:i clob:x len:15 +int_divrem_un: dest:a src1:a src2:d src3:i clob:x len:15 +int_divrem2: dest:d len:3 int_and: dest:i src1:i src2:i clob:1 len:4 int_or: dest:i src1:i src2:i clob:1 len:4 int_xor: dest:i src1:i src2:i clob:1 len:4 diff --git a/src/mono/mono/mini/cpu-x86.mdesc b/src/mono/mono/mini/cpu-x86.mdesc index e68d613872433..2e82fc3f6024c 100644 --- a/src/mono/mono/mini/cpu-x86.mdesc +++ b/src/mono/mono/mini/cpu-x86.mdesc @@ -111,6 +111,9 @@ int_div: dest:a src1:a src2:i len:15 clob:d int_div_un: dest:a src1:a src2:i len:15 clob:d int_rem: dest:d src1:a src2:i len:15 clob:a int_rem_un: dest:d src1:a src2:i len:15 clob:a +int_divrem: dest:a src1:a src2:d src3:i clob:x len:15 +int_divrem_un: dest:a src1:a src2:d src3:i clob:x len:15 +int_divrem2: dest:d len:3 int_and: dest:i src1:i src2:i clob:1 len:2 int_or: dest:i src1:i src2:i clob:1 len:2 int_xor: dest:i src1:i src2:i clob:1 len:2 @@ 
-303,6 +306,10 @@ x86_fp_load_i8: dest:f src1:b len:7 x86_fp_load_i4: dest:f src1:b len:7 x86_seteq_membase: src1:b len:7 x86_setne_membase: src1:b len:7 +x86_bsf32: dest:i src1:i len:4 +x86_bsf64: dest:l src1:l len:4 +x86_bsr32: dest:i src1:i len:4 +x86_bsr64: dest:l src1:l len:4 x86_add_reg_membase: dest:i src1:i src2:b clob:1 len:11 x86_sub_reg_membase: dest:i src1:i src2:b clob:1 len:11 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 2a87c1bf4bbb5..01c6216386ebf 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -5126,6 +5126,27 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_div_reg_size (code, ins->sreg2, FALSE, 4); } break; + case OP_X86_LDIVREM: + amd64_div_reg (code, ins->sreg3, TRUE); + break; + case OP_X86_IDIVREM: + amd64_div_reg_size (code, ins->sreg3, TRUE, 4); + break; + case OP_X86_LDIVREMU: + amd64_div_reg (code, ins->sreg3, FALSE); + break; + case OP_X86_IDIVREMU: + amd64_div_reg_size (code, ins->sreg3, FALSE, 4); + break; + case OP_X86_IDIVREM2: + if (ins->dreg != AMD64_RDX) + amd64_mov_reg_reg (code, ins->dreg, AMD64_RDX, 4); + break; + case OP_X86_LDIVREM2: + if (ins->dreg != AMD64_RDX) + amd64_mov_reg_reg (code, ins->dreg, AMD64_RDX, 8); + break; + case OP_LMUL_OVF: amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2); EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException"); @@ -5685,6 +5706,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_X86_XCHG: amd64_xchg_reg_reg (code, ins->sreg1, ins->sreg2, 4); break; + case OP_X86_BSF32: + amd64_bsf_size (code, ins->dreg, ins->sreg1, 4); + break; + case OP_X86_BSF64: + amd64_bsf_size (code, ins->dreg, ins->sreg1, 8); + break; + case OP_X86_BSR32: + amd64_bsr_size (code, ins->dreg, ins->sreg1, 4); + break; + case OP_X86_BSR64: + amd64_bsr_size (code, ins->dreg, ins->sreg1, 8); + break; + case OP_LOCALLOC: /* keep alignment */ amd64_alu_reg_imm (code, X86_ADD, 
ins->sreg1, MONO_ARCH_FRAME_ALIGNMENT - 1); diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index bbd8b38da1c4e..8761fbb1243f0 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -5640,6 +5640,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) LLVMBuilderRef builder; gboolean has_terminator; LLVMValueRef lhs, rhs, arg3; + LLVMValueRef last_divrem = NULL; int nins = 0; cbb = get_end_bb (ctx, bb); @@ -6638,6 +6639,44 @@ MONO_RESTORE_WARNING values [ins->dreg] = LLVMBuildXor (builder, tz, width, dname); break; } + case OP_X86_IDIVREM: + case OP_X86_LDIVREM: { + const LLVMTypeRef part_type = ins->opcode==OP_X86_IDIVREM ? LLVMInt32Type () : LLVMInt64Type (); + const LLVMTypeRef full_type = ins->opcode==OP_X86_IDIVREM ? LLVMInt64Type () : LLVMInt128Type (); + const LLVMValueRef shift_amount = ins->opcode==OP_X86_IDIVREM ? const_int32 (32) : const_int32 (64); + + LLVMValueRef dividend_low = LLVMBuildZExt (builder, convert (ctx, lhs, part_type), full_type, ""); + LLVMValueRef dividend_high = LLVMBuildSExt (builder, convert (ctx, rhs, part_type), full_type, ""); + LLVMValueRef dividend = LLVMBuildOr (builder, dividend_low, + LLVMBuildShl (builder, dividend_high, shift_amount, ""), ""); + LLVMValueRef divisor = LLVMBuildSExt (builder, convert (ctx, arg3, part_type), full_type, ""); + // LLVM should fuse the individual Div and Rem instructions into one DIV/IDIV on x86 + values [ins->dreg] = LLVMBuildTrunc (builder, LLVMBuildSDiv (builder, dividend, divisor, ""), part_type, ""); + last_divrem = LLVMBuildTrunc (builder, LLVMBuildSRem (builder, dividend, divisor, ""), part_type, ""); + break; + } + case OP_X86_IDIVREMU: + case OP_X86_LDIVREMU: { + const LLVMTypeRef part_type = ins->opcode==OP_X86_IDIVREMU ? LLVMInt32Type () : LLVMInt64Type (); + const LLVMTypeRef full_type = ins->opcode==OP_X86_IDIVREMU ? LLVMInt64Type () : LLVMInt128Type (); + const LLVMValueRef shift_amount = ins->opcode==OP_X86_IDIVREMU ? 
const_int32 (32) : const_int32 (64); + + LLVMValueRef dividend_low = LLVMBuildZExt (builder, convert (ctx, lhs, part_type), full_type, ""); + LLVMValueRef dividend_high = LLVMBuildZExt (builder, convert (ctx, rhs, part_type), full_type, ""); + LLVMValueRef dividend = LLVMBuildOr (builder, dividend_low, + LLVMBuildShl (builder, dividend_high, shift_amount, ""), ""); + LLVMValueRef divisor = LLVMBuildZExt (builder, convert (ctx, arg3, part_type), full_type, ""); + values [ins->dreg] = LLVMBuildTrunc (builder, LLVMBuildUDiv (builder, dividend, divisor, ""), part_type, ""); + last_divrem = LLVMBuildTrunc (builder, LLVMBuildURem (builder, dividend, divisor, ""), part_type, ""); + break; + } + case OP_X86_IDIVREM2: + case OP_X86_LDIVREM2: { + g_assert (last_divrem); + values [ins->dreg] = last_divrem; + last_divrem = NULL; + break; + } #endif case OP_ICONV_TO_I1: @@ -12043,6 +12082,8 @@ MONO_RESTORE_WARNING } } + g_assert (last_divrem == NULL); + if (!ctx_ok (ctx)) return; diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index d95f094807340..2813ae2035160 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1184,6 +1184,28 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG) #endif +#if defined(TARGET_X86) || defined(TARGET_AMD64) +/* + * These operations exist to facilitate simultaneous int/uint division + * and remainder on x86/x86-64. On that platform the DIV/IDIV instructions + * operate as follows: edx:eax/reg32 -> (eax=quotient,edx=remainder). Mono + * ops only support one destination register, so two operations are needed + * to obtain two result values. One would use {long,int}_divrem[_un] first, + * and the corresponding {long,int}_divrem2 immediately afterwards. The + * first instruction returns the quotient and leaves the remainder in the + * edx(rdx) register. The second instruction puts a virtual register over + * edx, so that its value can be used. 
Note that if the first instruction + * is emitted, the second must be also (there is an assert). This works + * both in LLVM and mini. + */ +MINI_OP3(OP_X86_LDIVREM, "long_divrem", LREG, LREG, LREG, LREG) +MINI_OP3(OP_X86_LDIVREMU, "long_divrem_un", LREG, LREG, LREG, LREG) +MINI_OP3(OP_X86_LDIVREM2, "long_divrem2", LREG, NONE, NONE, NONE) +MINI_OP3(OP_X86_IDIVREM, "int_divrem", IREG, IREG, IREG, IREG) +MINI_OP3(OP_X86_IDIVREMU, "int_divrem_un", IREG, IREG, IREG, IREG) +MINI_OP3(OP_X86_IDIVREM2, "int_divrem2", IREG, NONE, NONE, NONE) +#endif + MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE) MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE) diff --git a/src/mono/mono/mini/mini-x86.c b/src/mono/mono/mini/mini-x86.c index d71ae67fa9d92..80116fb5497f3 100644 --- a/src/mono/mono/mini/mini-x86.c +++ b/src/mono/mono/mini/mini-x86.c @@ -2877,6 +2877,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_X86_LDIVREM: + case OP_X86_LDIVREMU: + case OP_X86_LDIVREM2: + g_assert_not_reached (); + break; + case OP_X86_IDIVREM: + case OP_X86_IDIVREMU: + x86_div_reg (code, ins->sreg3, ins->opcode==OP_X86_IDIVREM); + break; + case OP_X86_IDIVREM2: + if (ins->dreg != X86_EDX) + x86_mov_reg_reg (code, ins->dreg, X86_EDX); + break; case OP_IOR: x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2); break; @@ -3309,6 +3322,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_X86_XCHG: x86_xchg_reg_reg (code, ins->sreg1, ins->sreg2, 4); break; + case OP_X86_BSF32: + x86_bsf (code, ins->dreg, ins->sreg1); + break; + case OP_X86_BSR32: + x86_bsr (code, ins->dreg, ins->sreg1); + break; case OP_LOCALLOC: /* keep alignment */ x86_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_LOCALLOC_ALIGNMENT - 1); diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 8af7db24beb57..a6dfaedf92cc2 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ 
b/src/mono/mono/mini/simd-intrinsics.c @@ -4599,6 +4599,7 @@ static SimdIntrinsic bmi2_methods [] = { static SimdIntrinsic x86base_methods [] = { {SN_BitScanForward}, {SN_BitScanReverse}, + {SN_DivRem}, {SN_Pause, OP_XOP, INTRINS_SSE_PAUSE}, {SN_get_IsSupported} }; @@ -4620,7 +4621,7 @@ static const IntrinGroup supported_x86_intrinsics [] = { { "Sse41", MONO_CPU_X86_SSE41, sse41_methods, sizeof (sse41_methods) }, { "Sse42", MONO_CPU_X86_SSE42, sse42_methods, sizeof (sse42_methods) }, { "Ssse3", MONO_CPU_X86_SSSE3, ssse3_methods, sizeof (ssse3_methods) }, - { "X86Base", 0, x86base_methods, sizeof (x86base_methods) }, + { "X86Base", MONO_CPU_INITED, x86base_methods, sizeof (x86base_methods), TRUE }, { "X86Serialize", 0, unsupported, sizeof (unsupported) }, }; @@ -5246,6 +5247,49 @@ emit_x86_intrinsics ( ins->type = is_64bit ? STACK_I8 : STACK_I4; MONO_ADD_INS (cfg->cbb, ins); return ins; + case SN_DivRem: { + g_assert (!(TARGET_SIZEOF_VOID_P == 4 && is_64bit)); // x86(no -64) cannot do divisions with 64-bit regs + const MonoStackType divtype = is_64bit ? STACK_I8 : STACK_I4; + const int storetype = is_64bit ? OP_STOREI8_MEMBASE_REG : OP_STOREI4_MEMBASE_REG; + const int obj_size = MONO_ABI_SIZEOF (MonoObject); + + // We must decide by the second argument, the first is always unsigned here + MonoTypeEnum arg1_type = fsig->param_count > 1 ? get_underlying_type (fsig->params [1]) : MONO_TYPE_VOID; + MonoInst* div; + MonoInst* div2; + + if (type_enum_is_unsigned (arg1_type)) { + MONO_INST_NEW (cfg, div, is_64bit ? OP_X86_LDIVREMU : OP_X86_IDIVREMU); + } else { + MONO_INST_NEW (cfg, div, is_64bit ? OP_X86_LDIVREM : OP_X86_IDIVREM); + } + div->dreg = is_64bit ? 
alloc_lreg (cfg) : alloc_ireg (cfg); + div->sreg1 = args [0]->dreg; // we can use this directly, reg alloc knows that the contents will be destroyed + div->sreg2 = args [1]->dreg; // same here as ^ + div->sreg3 = args [2]->dreg; + div->type = divtype; + MONO_ADD_INS (cfg->cbb, div); + + // Protect the contents of edx/rdx by assigning it a vreg. The instruction must + // immediately follow DIV/IDIV so that edx content is not modified. + // In LLVM the remainder is already calculated, just need to capture it in a vreg. + MONO_INST_NEW (cfg, div2, is_64bit ? OP_X86_LDIVREM2 : OP_X86_IDIVREM2); + div2->dreg = is_64bit ? alloc_lreg (cfg) : alloc_ireg (cfg); + div2->type = divtype; + MONO_ADD_INS (cfg->cbb, div2); + + // TODO: Can the creation of tuple be elided? (e.g. if deconstruction is used) + MonoInst* tuple = mono_compile_create_var (cfg, fsig->ret, OP_LOCAL); + MonoInst* tuple_addr; + EMIT_NEW_TEMPLOADA (cfg, tuple_addr, tuple->inst_c0); + + MonoClassField* field1 = mono_class_get_field_from_name_full (tuple->klass, "Item1", NULL); + MONO_EMIT_NEW_STORE_MEMBASE (cfg, storetype, tuple_addr->dreg, field1->offset - obj_size, div->dreg); + MonoClassField* field2 = mono_class_get_field_from_name_full (tuple->klass, "Item2", NULL); + MONO_EMIT_NEW_STORE_MEMBASE (cfg, storetype, tuple_addr->dreg, field2->offset - obj_size, div2->dreg); + EMIT_NEW_TEMPLOAD (cfg, ins, tuple->inst_c0); + return ins; + } default: g_assert_not_reached (); } diff --git a/src/mono/mono/mini/simd-methods.h b/src/mono/mono/mini/simd-methods.h index 6f237c145c57e..e8f3074195501 100644 --- a/src/mono/mono/mini/simd-methods.h +++ b/src/mono/mono/mini/simd-methods.h @@ -293,6 +293,7 @@ METHOD(ComputeCrc32C) // X86Base METHOD(BitScanForward) METHOD(BitScanReverse) +METHOD(DivRem) METHOD(Pause) // Crypto METHOD(FixedRotate) diff --git a/src/tests/issues.targets b/src/tests/issues.targets index bba141d5b1f36..20ca6b33799ed 100644 --- a/src/tests/issues.targets +++ b/src/tests/issues.targets @@ -1213,15 
+1213,6 @@ https://github.com/dotnet/runtime/issues/54185 - - https://github.com/dotnet/runtime/issues/75767 - - - https://github.com/dotnet/runtime/issues/75767 - - - https://github.com/dotnet/runtime/issues/75767 - Mono does not define out of range fp to int conversions