Skip to content

Commit

Permalink
[mono][jit] Enable all X86Base intrinsics (dotnet#91393)
Browse files Browse the repository at this point in the history
* Moving towards DivRem intrinsic.

* DivRem for x64.

* Typos.

* Fixed intrinsic name.

* Fix.

* x86Base intrinsics in mini and LLVM.

* Assert on unconsumed remainder in fused DivRem.

* Reenable x86base tests.

* Fixed narrowing assignment.

* Documented the divrem operation pair.
  • Loading branch information
jandupej authored Sep 8, 2023
1 parent 840e8fa commit a974719
Show file tree
Hide file tree
Showing 11 changed files with 185 additions and 10 deletions.
3 changes: 3 additions & 0 deletions src/mono/mono/arch/amd64/amd64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1206,6 +1206,9 @@ typedef union {
#define amd64_movsb_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsb(inst); amd64_codegen_post(inst); } while (0)
#define amd64_movsl_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsl(inst); amd64_codegen_post(inst); } while (0)
#define amd64_movsd_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_movsd(inst); amd64_codegen_post(inst); } while (0)
/*
 * BSF/BSR dreg <- reg with an explicit operand size.
 * A REX prefix is produced from the size and the full register numbers;
 * the x86 encoder then receives both registers masked to their low 3 bits.
 */
#define amd64_bsf_size(inst,dreg,reg,size) \
	do { \
		amd64_codegen_pre (inst); \
		amd64_emit_rex ((inst), (size), (dreg), 0, (reg)); \
		x86_bsf ((inst), ((dreg) & 0x7), ((reg) & 0x7)); \
		amd64_codegen_post (inst); \
	} while (0)
#define amd64_bsr_size(inst,dreg,reg,size) \
	do { \
		amd64_codegen_pre (inst); \
		amd64_emit_rex ((inst), (size), (dreg), 0, (reg)); \
		x86_bsr ((inst), ((dreg) & 0x7), ((reg) & 0x7)); \
		amd64_codegen_post (inst); \
	} while (0)

#define amd64_prefix_size(inst,p,size) do { x86_prefix((inst), p); } while (0)
#define amd64_rdtsc_size(inst,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),0,0,0); x86_rdtsc(inst); amd64_codegen_post(inst); } while (0)
#define amd64_cmpxchg_reg_reg_size(inst,dreg,reg,size) do { amd64_codegen_pre(inst); amd64_emit_rex ((inst),(size),(dreg),0,(reg)); x86_cmpxchg_reg_reg((inst),((dreg)&0x7),((reg)&0x7)); amd64_codegen_post(inst); } while (0)
Expand Down
3 changes: 3 additions & 0 deletions src/mono/mono/arch/x86/x86-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1963,6 +1963,9 @@ mono_x86_patch_inline (guchar* code, gpointer target)
#define x86_leave(inst) do { x86_byte (inst, 0xc9); } while (0)
#define x86_sahf(inst) do { x86_byte (inst, 0x9e); } while (0)

/*
 * BSF / BSR dreg <- reg.
 * Emits the two opcode bytes (0x0f 0xbc for BSF, 0x0f 0xbd for BSR)
 * followed by whatever x86_reg_emit produces for the (dreg, reg) pair.
 */
#define x86_bsf(inst,dreg,reg) \
	do { \
		x86_byte ((inst), 0x0f); \
		x86_byte ((inst), 0xbc); \
		x86_reg_emit ((inst), (dreg), (reg)); \
	} while (0)
#define x86_bsr(inst,dreg,reg) \
	do { \
		x86_byte ((inst), 0x0f); \
		x86_byte ((inst), 0xbd); \
		x86_reg_emit ((inst), (dreg), (reg)); \
	} while (0)

#define x86_fsin(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xfe); } while (0)
#define x86_fcos(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xff); } while (0)
#define x86_fabs(inst) do { x86_codegen_pre(&(inst), 2); x86_byte (inst, 0xd9); x86_byte (inst, 0xe1); } while (0)
Expand Down
10 changes: 10 additions & 0 deletions src/mono/mono/mini/cpu-amd64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ long_div: dest:a src1:a src2:i len:16 clob:d
long_div_un: dest:a src1:a src2:i len:16 clob:d
long_rem: dest:d src1:a src2:i len:16 clob:a
long_rem_un: dest:d src1:a src2:i len:16 clob:a
long_divrem: dest:a src1:a src2:d src3:i len:16 clob:x
long_divrem_un: dest:a src1:a src2:d src3:i len:16 clob:x
long_divrem2: dest:d len:3
long_and: dest:i src1:i src2:i len:3 clob:1
long_or: dest:i src1:i src2:i len:3 clob:1
long_xor: dest:i src1:i src2:i len:3 clob:1
Expand Down Expand Up @@ -337,6 +340,10 @@ amd64_lea_membase: dest:i src1:i len:11
x86_xchg: src1:i src2:i clob:x len:2
x86_fpop: src1:f len:3
x86_seteq_membase: src1:b len:9
x86_bsf32: dest:i src1:i len:4
x86_bsf64: dest:l src1:l len:4
x86_bsr32: dest:i src1:i len:4
x86_bsr64: dest:l src1:l len:4

x86_add_reg_membase: dest:i src1:i src2:b clob:1 len:13
x86_sub_reg_membase: dest:i src1:i src2:b clob:1 len:13
Expand Down Expand Up @@ -411,6 +418,9 @@ int_div: dest:a src1:a src2:i clob:d len:32
int_div_un: dest:a src1:a src2:i clob:d len:32
int_rem: dest:d src1:a src2:i clob:a len:32
int_rem_un: dest:d src1:a src2:i clob:a len:32
int_divrem: dest:a src1:a src2:d src3:i clob:x len:15
int_divrem_un: dest:a src1:a src2:d src3:i clob:x len:15
int_divrem2: dest:d len:3
int_and: dest:i src1:i src2:i clob:1 len:4
int_or: dest:i src1:i src2:i clob:1 len:4
int_xor: dest:i src1:i src2:i clob:1 len:4
Expand Down
7 changes: 7 additions & 0 deletions src/mono/mono/mini/cpu-x86.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ int_div: dest:a src1:a src2:i len:15 clob:d
int_div_un: dest:a src1:a src2:i len:15 clob:d
int_rem: dest:d src1:a src2:i len:15 clob:a
int_rem_un: dest:d src1:a src2:i len:15 clob:a
int_divrem: dest:a src1:a src2:d src3:i clob:x len:15
int_divrem_un: dest:a src1:a src2:d src3:i clob:x len:15
int_divrem2: dest:d len:3
int_and: dest:i src1:i src2:i clob:1 len:2
int_or: dest:i src1:i src2:i clob:1 len:2
int_xor: dest:i src1:i src2:i clob:1 len:2
Expand Down Expand Up @@ -303,6 +306,10 @@ x86_fp_load_i8: dest:f src1:b len:7
x86_fp_load_i4: dest:f src1:b len:7
x86_seteq_membase: src1:b len:7
x86_setne_membase: src1:b len:7
x86_bsf32: dest:i src1:i len:4
x86_bsf64: dest:l src1:l len:4
x86_bsr32: dest:i src1:i len:4
x86_bsr64: dest:l src1:l len:4

x86_add_reg_membase: dest:i src1:i src2:b clob:1 len:11
x86_sub_reg_membase: dest:i src1:i src2:b clob:1 len:11
Expand Down
34 changes: 34 additions & 0 deletions src/mono/mono/mini/mini-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -5126,6 +5126,27 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
amd64_div_reg_size (code, ins->sreg2, FALSE, 4);
}
break;
case OP_X86_LDIVREM:
case OP_X86_LDIVREMU:
	/* 64-bit divide by sreg3; the opcode selects signed vs. unsigned */
	amd64_div_reg (code, ins->sreg3, ins->opcode == OP_X86_LDIVREM);
	break;
case OP_X86_IDIVREM:
case OP_X86_IDIVREMU:
	/* 32-bit divide by sreg3; the opcode selects signed vs. unsigned */
	amd64_div_reg_size (code, ins->sreg3, ins->opcode == OP_X86_IDIVREM, 4);
	break;
case OP_X86_IDIVREM2:
	/* Copy the 32-bit value left in RDX unless dreg already is RDX */
	if (AMD64_RDX != ins->dreg)
		amd64_mov_reg_reg (code, ins->dreg, AMD64_RDX, 4);
	break;
case OP_X86_LDIVREM2:
	/* Same as above, with a 64-bit move */
	if (AMD64_RDX != ins->dreg)
		amd64_mov_reg_reg (code, ins->dreg, AMD64_RDX, 8);
	break;

case OP_LMUL_OVF:
amd64_imul_reg_reg (code, ins->sreg1, ins->sreg2);
EMIT_COND_SYSTEM_EXCEPTION (X86_CC_O, FALSE, "OverflowException");
Expand Down Expand Up @@ -5685,6 +5706,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_X86_XCHG:
amd64_xchg_reg_reg (code, ins->sreg1, ins->sreg2, 4);
break;
case OP_X86_BSF32:
case OP_X86_BSF64:
	/* BSF dreg <- sreg1; operand size is 8 bytes for the 64-bit opcode, 4 otherwise */
	amd64_bsf_size (code, ins->dreg, ins->sreg1, ins->opcode == OP_X86_BSF64 ? 8 : 4);
	break;
case OP_X86_BSR32:
case OP_X86_BSR64:
	/* BSR dreg <- sreg1; operand size is 8 bytes for the 64-bit opcode, 4 otherwise */
	amd64_bsr_size (code, ins->dreg, ins->sreg1, ins->opcode == OP_X86_BSR64 ? 8 : 4);
	break;

case OP_LOCALLOC:
/* keep alignment */
amd64_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_FRAME_ALIGNMENT - 1);
Expand Down
41 changes: 41 additions & 0 deletions src/mono/mono/mini/mini-llvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -5640,6 +5640,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
LLVMBuilderRef builder;
gboolean has_terminator;
LLVMValueRef lhs, rhs, arg3;
LLVMValueRef last_divrem = NULL;
int nins = 0;

cbb = get_end_bb (ctx, bb);
Expand Down Expand Up @@ -6638,6 +6639,44 @@ MONO_RESTORE_WARNING
values [ins->dreg] = LLVMBuildXor (builder, tz, width, dname);
break;
}
case OP_X86_IDIVREM:
case OP_X86_LDIVREM: {
	/*
	 * Signed fused division/remainder.
	 * lhs holds the low half and rhs the high half of a double-width
	 * dividend; arg3 is the divisor. The quotient becomes this
	 * instruction's value, the remainder is parked in last_divrem until
	 * the matching OP_X86_{I,L}DIVREM2 picks it up.
	 */
	const gboolean is_32bit = ins->opcode == OP_X86_IDIVREM;
	const LLVMTypeRef part_type = is_32bit ? LLVMInt32Type () : LLVMInt64Type ();
	const LLVMTypeRef full_type = is_32bit ? LLVMInt64Type () : LLVMInt128Type ();
	// Both operands of shl must have the same type, so the shift amount
	// is created in full_type instead of as an i32 constant.
	const LLVMValueRef shift_amount = LLVMConstInt (full_type, is_32bit ? 32 : 64, FALSE);

	// Low half is zero-extended so that it cannot disturb the high half when OR-ed in
	LLVMValueRef dividend_low = LLVMBuildZExt (builder, convert (ctx, lhs, part_type), full_type, "");
	LLVMValueRef dividend_high = LLVMBuildSExt (builder, convert (ctx, rhs, part_type), full_type, "");
	LLVMValueRef dividend = LLVMBuildOr (builder, dividend_low,
		LLVMBuildShl (builder, dividend_high, shift_amount, ""), "");
	LLVMValueRef divisor = LLVMBuildSExt (builder, convert (ctx, arg3, part_type), full_type, "");
	// LLVM should fuse the individual Div and Rem instructions into one DIV/IDIV on x86
	values [ins->dreg] = LLVMBuildTrunc (builder, LLVMBuildSDiv (builder, dividend, divisor, ""), part_type, "");
	last_divrem = LLVMBuildTrunc (builder, LLVMBuildSRem (builder, dividend, divisor, ""), part_type, "");
	break;
}
case OP_X86_IDIVREMU:
case OP_X86_LDIVREMU: {
	/*
	 * Unsigned fused division/remainder.
	 * lhs holds the low half and rhs the high half of a double-width
	 * dividend; arg3 is the divisor. The quotient becomes this
	 * instruction's value, the remainder is parked in last_divrem until
	 * the matching OP_X86_{I,L}DIVREM2 picks it up.
	 */
	const gboolean is_32bit = ins->opcode == OP_X86_IDIVREMU;
	const LLVMTypeRef part_type = is_32bit ? LLVMInt32Type () : LLVMInt64Type ();
	const LLVMTypeRef full_type = is_32bit ? LLVMInt64Type () : LLVMInt128Type ();
	// Both operands of shl must have the same type, so the shift amount
	// is created in full_type instead of as an i32 constant.
	const LLVMValueRef shift_amount = LLVMConstInt (full_type, is_32bit ? 32 : 64, FALSE);

	LLVMValueRef dividend_low = LLVMBuildZExt (builder, convert (ctx, lhs, part_type), full_type, "");
	LLVMValueRef dividend_high = LLVMBuildZExt (builder, convert (ctx, rhs, part_type), full_type, "");
	LLVMValueRef dividend = LLVMBuildOr (builder, dividend_low,
		LLVMBuildShl (builder, dividend_high, shift_amount, ""), "");
	LLVMValueRef divisor = LLVMBuildZExt (builder, convert (ctx, arg3, part_type), full_type, "");
	// As in the signed case, the separate div and rem are left for LLVM to combine
	values [ins->dreg] = LLVMBuildTrunc (builder, LLVMBuildUDiv (builder, dividend, divisor, ""), part_type, "");
	last_divrem = LLVMBuildTrunc (builder, LLVMBuildURem (builder, dividend, divisor, ""), part_type, "");
	break;
}
case OP_X86_IDIVREM2:
case OP_X86_LDIVREM2: {
	/*
	 * Hand out the remainder computed by the preceding divrem op.
	 * The pending value is cleared so that it can only be consumed once.
	 */
	LLVMValueRef pending_remainder = last_divrem;
	g_assert (pending_remainder != NULL);
	last_divrem = NULL;
	values [ins->dreg] = pending_remainder;
	break;
}
#endif

case OP_ICONV_TO_I1:
Expand Down Expand Up @@ -12043,6 +12082,8 @@ MONO_RESTORE_WARNING
}
}

g_assert (last_divrem == NULL);

if (!ctx_ok (ctx))
return;

Expand Down
22 changes: 22 additions & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,28 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG)

#endif

#if defined(TARGET_X86) || defined(TARGET_AMD64)
/*
* These operations exist to facilitate simultaneous int/uint division
* and remainder on x86/x86-64. On these platforms the DIV/IDIV instructions
* operate as follows: edx:eax / reg32 -> (eax=quotient, edx=remainder). Mono
* ops only support one destination register, so two operations are needed
* to obtain two result values. One would use {long,int}_divrem[_un] first,
* and the corresponding {long,int}_divrem2 immediately afterwards. The
* first instruction returns the quotient and leaves the remainder in the
* edx (rdx) register. The second instruction puts a virtual register over
* edx, so that its value can be used. Note that if the first instruction
* is emitted, the second must be emitted as well (there is an assert). This
* works both in LLVM and mini.
*/
MINI_OP3(OP_X86_LDIVREM, "long_divrem", LREG, LREG, LREG, LREG)
MINI_OP3(OP_X86_LDIVREMU, "long_divrem_un", LREG, LREG, LREG, LREG)
MINI_OP3(OP_X86_LDIVREM2, "long_divrem2", LREG, NONE, NONE, NONE)
MINI_OP3(OP_X86_IDIVREM, "int_divrem", IREG, IREG, IREG, IREG)
MINI_OP3(OP_X86_IDIVREMU, "int_divrem_un", IREG, IREG, IREG, IREG)
MINI_OP3(OP_X86_IDIVREM2, "int_divrem2", IREG, NONE, NONE, NONE)
#endif

MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE)
MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)

Expand Down
19 changes: 19 additions & 0 deletions src/mono/mono/mini/mini-x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -2877,6 +2877,19 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_X86_LDIVREM:
case OP_X86_LDIVREMU:
case OP_X86_LDIVREM2:
	/* The 64-bit variants are not implemented by this backend */
	g_assert_not_reached ();
	break;
case OP_X86_IDIVREM:
	/* Signed divide by sreg3 */
	x86_div_reg (code, ins->sreg3, TRUE);
	break;
case OP_X86_IDIVREMU:
	/* Unsigned divide by sreg3 */
	x86_div_reg (code, ins->sreg3, FALSE);
	break;
case OP_X86_IDIVREM2:
	/* Copy the value left in EDX unless dreg already is EDX */
	if (X86_EDX != ins->dreg)
		x86_mov_reg_reg (code, ins->dreg, X86_EDX);
	break;
case OP_IOR:
x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2);
break;
Expand Down Expand Up @@ -3309,6 +3322,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_X86_XCHG:
x86_xchg_reg_reg (code, ins->sreg1, ins->sreg2, 4);
break;
case OP_X86_BSF32:
/* BSF dreg <- sreg1 */
x86_bsf (code, ins->dreg, ins->sreg1);
break;
case OP_X86_BSR32:
/* BSR dreg <- sreg1 */
x86_bsr (code, ins->dreg, ins->sreg1);
break;
case OP_LOCALLOC:
/* keep alignment */
x86_alu_reg_imm (code, X86_ADD, ins->sreg1, MONO_ARCH_LOCALLOC_ALIGNMENT - 1);
Expand Down
46 changes: 45 additions & 1 deletion src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -4599,6 +4599,7 @@ static SimdIntrinsic bmi2_methods [] = {
/*
 * Methods recognized in the X86Base intrinsic group.
 * Entries that list only a method id carry no opcode here; Pause is the
 * only entry that maps directly to an opcode (OP_XOP with INTRINS_SSE_PAUSE).
 * NOTE(review): the entry order presumably has to match what the lookup
 * code expects - confirm before inserting new entries elsewhere.
 */
static SimdIntrinsic x86base_methods [] = {
{SN_BitScanForward},
{SN_BitScanReverse},
{SN_DivRem},
{SN_Pause, OP_XOP, INTRINS_SSE_PAUSE},
{SN_get_IsSupported}
};
Expand All @@ -4620,7 +4621,7 @@ static const IntrinGroup supported_x86_intrinsics [] = {
{ "Sse41", MONO_CPU_X86_SSE41, sse41_methods, sizeof (sse41_methods) },
{ "Sse42", MONO_CPU_X86_SSE42, sse42_methods, sizeof (sse42_methods) },
{ "Ssse3", MONO_CPU_X86_SSSE3, ssse3_methods, sizeof (ssse3_methods) },
{ "X86Base", 0, x86base_methods, sizeof (x86base_methods) },
{ "X86Base", MONO_CPU_INITED, x86base_methods, sizeof (x86base_methods), TRUE },
{ "X86Serialize", 0, unsupported, sizeof (unsupported) },
};

Expand Down Expand Up @@ -5246,6 +5247,49 @@ emit_x86_intrinsics (
ins->type = is_64bit ? STACK_I8 : STACK_I4;
MONO_ADD_INS (cfg->cbb, ins);
return ins;
case SN_DivRem: {
	/*
	 * Emits a fused divide (args [0] = low half of the dividend,
	 * args [1] = high half, args [2] = divisor) immediately followed by
	 * its *_DIVREM2 companion, then stores the quotient into Item1 and the
	 * remainder into Item2 of a local of the method's return type.
	 */
	g_assert (!(TARGET_SIZEOF_VOID_P == 4 && is_64bit)); // 32-bit x86 cannot divide using 64-bit registers
	// args [0], args [1] and args [2] are all consumed below, so anything
	// but a three-parameter signature would read past the argument array.
	g_assert (fsig->param_count == 3);
	const MonoStackType divtype = is_64bit ? STACK_I8 : STACK_I4;
	const int storetype = is_64bit ? OP_STOREI8_MEMBASE_REG : OP_STOREI4_MEMBASE_REG;
	const int obj_size = MONO_ABI_SIZEOF (MonoObject);

	// We must decide by the second argument, the first is always unsigned here
	MonoTypeEnum arg1_type = get_underlying_type (fsig->params [1]);
	MonoInst* div;
	MonoInst* div2;

	if (type_enum_is_unsigned (arg1_type)) {
		MONO_INST_NEW (cfg, div, is_64bit ? OP_X86_LDIVREMU : OP_X86_IDIVREMU);
	} else {
		MONO_INST_NEW (cfg, div, is_64bit ? OP_X86_LDIVREM : OP_X86_IDIVREM);
	}
	div->dreg = is_64bit ? alloc_lreg (cfg) : alloc_ireg (cfg);
	div->sreg1 = args [0]->dreg; // we can use this directly, reg alloc knows that the contents will be destroyed
	div->sreg2 = args [1]->dreg; // same here as ^
	div->sreg3 = args [2]->dreg;
	div->type = divtype;
	MONO_ADD_INS (cfg->cbb, div);

	// Protect the contents of edx/rdx by assigning it a vreg. The instruction must
	// immediately follow DIV/IDIV so that edx content is not modified.
	// In LLVM the remainder is already calculated, just need to capture it in a vreg.
	MONO_INST_NEW (cfg, div2, is_64bit ? OP_X86_LDIVREM2 : OP_X86_IDIVREM2);
	div2->dreg = is_64bit ? alloc_lreg (cfg) : alloc_ireg (cfg);
	div2->type = divtype;
	MONO_ADD_INS (cfg->cbb, div2);

	// TODO: Can the creation of tuple be elided? (e.g. if deconstruction is used)
	MonoInst* tuple = mono_compile_create_var (cfg, fsig->ret, OP_LOCAL);
	MonoInst* tuple_addr;
	EMIT_NEW_TEMPLOADA (cfg, tuple_addr, tuple->inst_c0);

	// The lookups return NULL when a field is missing; fail loudly instead
	// of dereferencing a NULL field below.
	MonoClassField* field1 = mono_class_get_field_from_name_full (tuple->klass, "Item1", NULL);
	MonoClassField* field2 = mono_class_get_field_from_name_full (tuple->klass, "Item2", NULL);
	g_assert (field1 && field2);

	// obj_size is subtracted from each field offset; presumably the offsets
	// include the object header, which the local does not have - confirm.
	MONO_EMIT_NEW_STORE_MEMBASE (cfg, storetype, tuple_addr->dreg, field1->offset - obj_size, div->dreg);
	MONO_EMIT_NEW_STORE_MEMBASE (cfg, storetype, tuple_addr->dreg, field2->offset - obj_size, div2->dreg);
	EMIT_NEW_TEMPLOAD (cfg, ins, tuple->inst_c0);
	return ins;
}
default:
g_assert_not_reached ();
}
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/simd-methods.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ METHOD(ComputeCrc32C)
// X86Base
METHOD(BitScanForward)
METHOD(BitScanReverse)
METHOD(DivRem)
METHOD(Pause)
// Crypto
METHOD(FixedRotate)
Expand Down
9 changes: 0 additions & 9 deletions src/tests/issues.targets
Original file line number Diff line number Diff line change
Expand Up @@ -1213,15 +1213,6 @@
<ExcludeList Include = "$(XunitTestBinBase)/JIT/HardwareIntrinsics/X86/Sse42.X64/Crc32_*/**">
<Issue>https://github.com/dotnet/runtime/issues/54185</Issue>
</ExcludeList>
<ExcludeList Include = "$(XUnitTestBinBase)/JIT/HardwareIntrinsics/X86/X86Base/X86Base*/**">
<Issue>https://github.com/dotnet/runtime/issues/75767</Issue>
</ExcludeList>
<ExcludeList Include = "$(XUnitTestBinBase)/JIT/HardwareIntrinsics/X86/X86Base/DivRem*/**">
<Issue>https://github.com/dotnet/runtime/issues/75767</Issue>
</ExcludeList>
<ExcludeList Include = "$(XUnitTestBinBase)/JIT/HardwareIntrinsics/X86/X86Base.X64/X86Base.X64*/**">
<Issue>https://github.com/dotnet/runtime/issues/75767</Issue>
</ExcludeList>
<ExcludeList Include="$(XunitTestBinBase)/JIT/Directed/Convert/out_of_range_fp_to_int_conversions/*">
<Issue>Mono does not define out of range fp to int conversions</Issue>
</ExcludeList>
Expand Down

0 comments on commit a974719

Please sign in to comment.