Skip to content

Commit

Permalink
[mono][jit] Transition the x86 backend to use SSE for fp arithmetic. (#65723)
Browse files Browse the repository at this point in the history

* [mono][jit] Transition the x86 backend to use SSE for fp arithmetic.

* Add SSE2 and FCMOV to the CPU requirements for mono on x86.
* Also force the usage of r4fp on x86, the same as on ARM.
* Most of the code is copied from amd64-codegen.h and
  mini-amd64.c.

* Reenable some tests.

* Fix build failures.

* Remove r4fp conditionals.

* Add missing RCONV_TO_I opcode.

* Fix OP_MOVE_F_TO_I4 and OP_MOVE_I4_TO_F.

* Remove fpstack support code.

* Fix warnings.

* Add back MONO_ARCH_FLOAT32_SUPPORTED on x86.

* Fix dreg type for r4_conv_to_i1 etc. opcodes.
  • Loading branch information
vargaz authored Aug 8, 2022
1 parent a194555 commit e71a958
Show file tree
Hide file tree
Showing 21 changed files with 1,071 additions and 891 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ public static void op_Increment(float value)
[InlineData(0.0f, 3.14f)]
[InlineData(4567.0f, -3.14f)]
[InlineData(4567.89101f, -3.14569f)]
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
public static void op_Addition(float left, float right)
{
NFloat result = new NFloat(left) + new NFloat(right);
Expand All @@ -253,7 +252,6 @@ public static void op_Addition(float left, float right)
[InlineData(0.0f, 3.14f)]
[InlineData(4567.0f, -3.14f)]
[InlineData(4567.89101f, -3.14569f)]
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
public static void op_Subtraction(float left, float right)
{
NFloat result = new NFloat(left) - new NFloat(right);
Expand All @@ -274,7 +272,6 @@ public static void op_Subtraction(float left, float right)
[InlineData(0.0f, 3.14f)]
[InlineData(4567.0f, -3.14f)]
[InlineData(4567.89101f, -3.14569f)]
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
public static void op_Multiply(float left, float right)
{
NFloat result = new NFloat(left) * new NFloat(right);
Expand All @@ -295,7 +292,6 @@ public static void op_Multiply(float left, float right)
[InlineData(0.0f, 3.14f)]
[InlineData(4567.0f, -3.14f)]
[InlineData(4567.89101f, -3.14569f)]
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
public static void op_Division(float left, float right)
{
NFloat result = new NFloat(left) / new NFloat(right);
Expand Down
509 changes: 500 additions & 9 deletions src/mono/mono/arch/x86/x86-codegen.h

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/mono/mono/mini/aot-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -12638,7 +12638,8 @@ compile_asm (MonoAotCompile *acfg)
#define LD_NAME "clang"
#define LD_OPTIONS "-m32 -dynamiclib"
#elif defined(TARGET_X86) && !defined(TARGET_MACH)
#define LD_OPTIONS "-m elf_i386 -Bsymbolic"
#define LD_NAME "ld"
#define LD_OPTIONS "--shared -m elf_i386"
#elif defined(TARGET_ARM) && !defined(TARGET_ANDROID)
#define LD_NAME "gcc"
#define LD_OPTIONS "--shared -Wl,-Bsymbolic"
Expand Down
77 changes: 56 additions & 21 deletions src/mono/mono/mini/cpu-x86.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ endfinally: len:16
endfilter: src1:a len:16
get_ex_obj: dest:a len:16

ckfinite: dest:f src1:f len:32
ckfinite: dest:f src1:f len:40
ceq: dest:y len:6
cgt: dest:y len:6
cgt_un: dest:y len:6
Expand All @@ -153,14 +153,18 @@ localloc: dest:i src1:i len:120
compare: src1:i src2:i len:2
compare_imm: src1:i len:6
fcompare: src1:f src2:f clob:a len:9
rcompare: src1:f src2:f clob:a len:13
arglist: src1:b len:10
check_this: src1:b len:3
voidcall: len:17 clob:c
voidcall_reg: src1:i len:11 clob:c
voidcall_membase: src1:b len:16 clob:c
fcall: dest:f len:17 clob:c
fcall_reg: dest:f src1:i len:11 clob:c
fcall_membase: dest:f src1:b len:16 clob:c
fcall: dest:f len:28 clob:c
fcall_reg: dest:f src1:i len:28 clob:c
fcall_membase: dest:f src1:b len:28 clob:c
rcall: dest:f len:28 clob:c
rcall_reg: dest:f src1:i len:28 clob:c
rcall_membase: dest:f src1:b len:28 clob:c
lcall: dest:l len:17 clob:c
lcall_reg: dest:l src1:i len:11 clob:c
lcall_membase: dest:l src1:b len:16 clob:c
Expand All @@ -170,8 +174,8 @@ vcall_membase: src1:b len:16 clob:c
call_reg: dest:a src1:i len:11 clob:c
call_membase: dest:a src1:b len:16 clob:c
iconst: dest:i len:5
r4const: dest:f len:15
r8const: dest:f len:16
r4const: dest:f len:24
r8const: dest:f len:24
store_membase_imm: dest:b len:11
store_membase_reg: dest:b src1:i len:7
storei1_membase_imm: dest:b len:10
Expand All @@ -182,8 +186,8 @@ storei4_membase_imm: dest:b len:10
storei4_membase_reg: dest:b src1:i len:7
storei8_membase_imm: dest:b
storei8_membase_reg: dest:b src1:i
storer4_membase_reg: dest:b src1:f len:7
storer8_membase_reg: dest:b src1:f len:7
storer4_membase_reg: dest:b src1:f len:9
storer8_membase_reg: dest:b src1:f len:9
load_membase: dest:i src1:b len:7
loadi1_membase: dest:y src1:b len:7
loadu1_membase: dest:y src1:b len:7
Expand All @@ -192,8 +196,8 @@ loadu2_membase: dest:i src1:b len:7
loadi4_membase: dest:i src1:b len:7
loadu4_membase: dest:i src1:b len:7
loadi8_membase: dest:i src1:b
loadr4_membase: dest:f src1:b len:7
loadr8_membase: dest:f src1:b len:7
loadr4_membase: dest:f src1:b len:9
loadr8_membase: dest:f src1:b len:9
loadu4_mem: dest:i len:9
move: dest:i src1:i len:2
addcc_imm: dest:i src1:i len:6 clob:1
Expand Down Expand Up @@ -237,25 +241,26 @@ float_bge: len:22
float_bge_un: len:12
float_ble: len:22
float_ble_un: len:12
float_add: dest:f src1:f src2:f len:2
float_sub: dest:f src1:f src2:f len:2
float_mul: dest:f src1:f src2:f len:2
float_div: dest:f src1:f src2:f len:2
float_div_un: dest:f src1:f src2:f len:2
float_add: dest:f src1:f src2:f len:8
float_sub: dest:f src1:f src2:f len:8
float_mul: dest:f src1:f src2:f len:8
float_div: dest:f src1:f src2:f len:8
float_div_un: dest:f src1:f src2:f len:8
float_rem: dest:f src1:f src2:f len:17
float_rem_un: dest:f src1:f src2:f len:17
float_neg: dest:f src1:f len:2
float_neg: dest:f src1:f len:24
float_not: dest:f src1:f len:2
float_conv_to_i1: dest:y src1:f len:39
float_conv_to_i2: dest:y src1:f len:39
float_conv_to_i4: dest:i src1:f len:39
float_conv_to_i8: dest:L src1:f len:39
float_conv_to_i8: dest:L src1:f len:50
float_conv_to_u4: dest:i src1:f len:39
float_conv_to_u8: dest:L src1:f len:39
float_conv_to_u2: dest:y src1:f len:39
float_conv_to_u1: dest:y src1:f len:39
float_conv_to_ovf_i: dest:a src1:f len:30
float_conv_to_ovd_u: dest:a src1:f len:30
float_conv_to_r4: dest:f src1:f len:17
float_mul_ovf:
float_ceq: dest:y src1:f src2:f len:25
float_cgt: dest:y src1:f src2:f len:25
Expand Down Expand Up @@ -312,7 +317,7 @@ sbb_imm: dest:i src1:i len:6 clob:1
br_reg: src1:i len:2
sin: dest:f src1:f len:6
cos: dest:f src1:f len:6
abs: dest:f src1:f len:2
abs: dest:f src1:f clob:1 len:16
tan: dest:f src1:f len:49
atan: dest:f src1:f len:8
sqrt: dest:f src1:f len:2
Expand Down Expand Up @@ -423,11 +428,12 @@ cmov_ile_un: dest:i src1:i src2:i len:16 clob:1
cmov_ilt_un: dest:i src1:i src2:i len:16 clob:1

long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30
long_conv_to_r8_2: dest:f src1:i src2:i len:14
long_conv_to_r4_2: dest:f src1:i src2:i len:14
long_conv_to_r8_2: dest:f src1:i src2:i len:24
long_conv_to_r4_2: dest:f src1:i src2:i len:24
long_conv_to_r_un_2: dest:f src1:i src2:i len:40

fmove: dest:f src1:f
fmove: dest:f src1:f len:4
rmove: dest:f src1:f len:4
move_f_to_i4: dest:i src1:f len:17
move_i4_to_f: dest:f src1:i len:17
float_conv_to_r4: dest:f src1:f len:12
Expand Down Expand Up @@ -671,3 +677,32 @@ set_sp: src1:i len:6
fill_prof_call_ctx: src1:i len:128

get_last_error: dest:i len:32

x86_move_r8_to_fpstack: src1:f len:16
x86_move_r4_to_fpstack: src1:f len:16
iconv_to_r4_raw: dest:f src1:i len:10

# R4 opcodes
r4_conv_to_i1: dest:y src1:f len:32
r4_conv_to_u1: dest:y src1:f len:32
r4_conv_to_i2: dest:y src1:f len:32
r4_conv_to_u2: dest:y src1:f len:32
r4_conv_to_i4: dest:i src1:f len:16
r4_conv_to_u4: dest:i src1:f len:32
r4_conv_to_i8: dest:L src1:f len:64
r4_conv_to_i: dest:i src1:f len:32
r4_conv_to_r8: dest:f src1:f len:17
r4_conv_to_r4: dest:f src1:f len:17
r4_add: dest:f src1:f src2:f clob:1 len:5
r4_sub: dest:f src1:f src2:f clob:1 len:5
r4_mul: dest:f src1:f src2:f clob:1 len:5
r4_div: dest:f src1:f src2:f clob:1 len:5
r4_neg: dest:f src1:f clob:1 len:23
r4_ceq: dest:y src1:f src2:f len:35
r4_cgt: dest:y src1:f src2:f len:35
r4_cgt_un: dest:y src1:f src2:f len:48
r4_clt: dest:y src1:f src2:f len:35
r4_clt_un: dest:y src1:f src2:f len:42
r4_cneq: dest:y src1:f src2:f len:42
r4_cge: dest:y src1:f src2:f len:35
r4_cle: dest:y src1:f src2:f len:35
17 changes: 2 additions & 15 deletions src/mono/mono/mini/local-propagation.c
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,6 @@ mono_local_cprop (MonoCompile *cfg)
/* This avoids propagating local vregs across calls */
((get_vreg_to_inst (cfg, def->sreg1) || !defs [def->sreg1] || (def_index [def->sreg1] >= last_call_index) || (def->opcode == OP_VMOVE))) &&
!(defs [def->sreg1] && mono_inst_next (defs [def->sreg1], filter) == def) &&
(!MONO_ARCH_USE_FPSTACK || (def->opcode != OP_FMOVE)) &&
(def->opcode != OP_FMOVE)) {
int vreg = def->sreg1;

Expand All @@ -640,7 +639,7 @@ mono_local_cprop (MonoCompile *cfg)
/* is_inst_imm is only needed for binops */
if ((((def->opcode == OP_ICONST) || ((sizeof (gpointer) == 8) && (def->opcode == OP_I8CONST)) || (def->opcode == OP_PCONST)))
||
(!MONO_ARCH_USE_FPSTACK && (def->opcode == OP_R8CONST))) {
(def->opcode == OP_R8CONST)) {
guint32 opcode2;

/* srcindex == 1 -> binop, ins->sreg2 == -1 -> unop */
Expand Down Expand Up @@ -815,17 +814,6 @@ mono_local_cprop (MonoCompile *cfg)
}
}

static gboolean
reg_is_softreg_no_fpstack (int reg, const char spec)
{
return (spec == 'i' && reg >= MONO_MAX_IREGS)
|| ((spec == 'f' && reg >= MONO_MAX_FREGS) && !MONO_ARCH_USE_FPSTACK)
#ifdef MONO_ARCH_SIMD_INTRINSICS
|| (spec == 'x' && reg >= MONO_MAX_XREGS)
#endif
|| (spec == 'v');
}

static gboolean
reg_is_softreg (int reg, const char spec)
{
Expand Down Expand Up @@ -953,8 +941,7 @@ mono_local_deadce (MonoCompile *cfg)
}
}

/* Enabling this on x86 could screw up the fp stack */
if (reg_is_softreg_no_fpstack (ins->dreg, spec [MONO_INST_DEST])) {
if (reg_is_softreg (ins->dreg, spec [MONO_INST_DEST])) {
/*
* Assignments to global vregs can only be eliminated if there is another
* assignment to the same vreg later in the same bblock.
Expand Down
10 changes: 2 additions & 8 deletions src/mono/mono/mini/method-to-ir.c
Original file line number Diff line number Diff line change
Expand Up @@ -7181,12 +7181,6 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b
}
case MONO_CEE_POP:
--sp;

#ifdef TARGET_X86
if (sp [0]->type == STACK_R8)
/* we need to pop the value from the x86 FP stack */
MONO_EMIT_NEW_UNALU (cfg, OP_X86_FPOP, -1, sp [0]->dreg);
#endif
break;
case MONO_CEE_JMP: {
MonoCallInst *call;
Expand Down Expand Up @@ -13057,7 +13051,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)
* sregs could use it. So set a flag, and do it after
* the sregs.
*/
if ((!cfg->backend->use_fpstack || ((store_opcode != OP_STORER8_MEMBASE_REG) && (store_opcode != OP_STORER4_MEMBASE_REG))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
dest_has_lvreg = TRUE;
}
}
Expand Down Expand Up @@ -13147,7 +13141,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)

sreg = alloc_dreg (cfg, stacktypes [regtype]);

if ((!cfg->backend->use_fpstack || ((load_opcode != OP_LOADR8_MEMBASE) && (load_opcode != OP_LOADR4_MEMBASE))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
if (var->dreg == prev_dreg) {
/*
* sreg refers to the value loaded by the load
Expand Down
4 changes: 0 additions & 4 deletions src/mono/mono/mini/mini-amd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,6 @@ struct sigcontext {
#define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1
#endif



#if defined(__APPLE__)
#define MONO_ARCH_SIGNAL_STACK_SIZE MINSIGSTKSZ
#else
Expand Down Expand Up @@ -164,8 +162,6 @@ struct sigcontext {
#define MONO_ARCH_CALLEE_REGS AMD64_CALLEE_REGS
#define MONO_ARCH_CALLEE_SAVED_REGS AMD64_CALLEE_SAVED_REGS

#define MONO_ARCH_USE_FPSTACK FALSE

#define MONO_ARCH_INST_FIXED_REG(desc) ((desc == '\0') ? -1 : ((desc == 'i' ? -1 : ((desc == 'a') ? AMD64_RAX : ((desc == 's') ? AMD64_RCX : ((desc == 'd') ? AMD64_RDX : ((desc == 'A') ? MONO_AMD64_ARG_REG1 : -1)))))))

/* RDX is clobbered by the opcode implementation before accessing sreg2 */
Expand Down
2 changes: 0 additions & 2 deletions src/mono/mono/mini/mini-arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,6 @@
#define MONO_ARCH_CALLEE_SAVED_FREGS 0x00000000
#endif

#define MONO_ARCH_USE_FPSTACK FALSE

#define MONO_ARCH_INST_SREG2_MASK(ins) (0)

#define MONO_ARCH_INST_FIXED_REG(desc) \
Expand Down
4 changes: 0 additions & 4 deletions src/mono/mono/mini/mini-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@

#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS

#define MONO_ARCH_USE_FPSTACK FALSE

#define MONO_ARCH_INST_SREG2_MASK(ins) (0)

#define MONO_ARCH_INST_FIXED_REG(desc) ((desc) == 'a' ? ARMREG_R0 : -1)
Expand All @@ -68,8 +66,6 @@

#define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (-1)

#define MONO_ARCH_USE_FPSTACK FALSE

#define MONO_ARCH_FRAME_ALIGNMENT 16

#define MONO_ARCH_CODE_ALIGNMENT 32
Expand Down
Loading

0 comments on commit e71a958

Please sign in to comment.