Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support to mono for v128 constants #81902

Merged
merged 7 commits into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/mono/mono/mini/aot-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -7005,6 +7005,13 @@ encode_patch (MonoAotCompile *acfg, MonoJumpInfo *patch_info, guint8 *buf, guint
encode_value (((guint32 *)patch_info->data.target) [MINI_LS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [MINI_MS_WORD_IDX], p, &p);
break;
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT:
encode_value (((guint32 *)patch_info->data.target) [(MINI_LS_WORD_IDX * 2) + MINI_LS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [(MINI_LS_WORD_IDX * 2) + MINI_MS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [(MINI_MS_WORD_IDX * 2) + MINI_LS_WORD_IDX], p, &p);
vargaz marked this conversation as resolved.
Show resolved Hide resolved
encode_value (((guint32 *)patch_info->data.target) [(MINI_MS_WORD_IDX * 2) + MINI_MS_WORD_IDX], p, &p);
Comment on lines +7008 to +7013
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, there isn't any SIMD support for BE architectures today.

It's possibly this needs to track the underlying element type and splat each T in memory correctly instead.

break;
case MONO_PATCH_INFO_VTABLE:
case MONO_PATCH_INFO_CLASS:
case MONO_PATCH_INFO_IID:
Expand Down
20 changes: 20 additions & 0 deletions src/mono/mono/mini/aot-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -3842,6 +3842,26 @@ decode_patch (MonoAotModule *aot_module, MonoMemPool *mp, MonoJumpInfo *ji, guin
*(double*)ji->data.target = *(double*)&v;
break;
}
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT: {
guint32 val [4];
guint64 v[2];

// FIXME: Align to 16 bytes ?
ji->data.target = mono_mem_manager_alloc0 (mem_manager, 16);

val [0] = decode_value (p, &p);
val [1] = decode_value (p, &p);
val [2] = decode_value (p, &p);
val [3] = decode_value (p, &p);

v[0] = ((guint64)val [1] << 32) | ((guint64)val [0]);
((guint64*)ji->data.target)[0] = v[0];

v[1] = ((guint64)val [3] << 32) | ((guint64)val [2]);
((guint64*)ji->data.target)[1] = v[1];
break;
}
case MONO_PATCH_INFO_LDSTR:
image = load_image (aot_module, decode_value (p, &p), error);
mono_error_cleanup (error); /* FIXME don't swallow the error */
Expand Down
9 changes: 7 additions & 2 deletions src/mono/mono/mini/cfold.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,15 @@ mono_constant_fold_ins (MonoCompile *cfg, MonoInst *ins, MonoInst *arg1, MonoIns
}
break;
case OP_XMOVE:
if (arg1->opcode == OP_XZERO) {
if ((arg1->opcode == OP_XZERO) || (arg1->opcode == OP_XONES)) {
ALLOC_DEST (cfg, dest, ins);
dest->opcode = OP_XZERO;
dest->opcode = arg1->opcode;
dest->sreg1 = -1;
} else if (arg1->opcode == OP_XCONST) {
ALLOC_DEST (cfg, dest, ins);
dest->opcode = arg1->opcode;
dest->sreg1 = -1;
dest->inst_p0 = arg1->inst_p0;
}
break;
case OP_COMPARE:
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/cpu-amd64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,7 @@ cvttps2dq: dest:x src1:x len:5 clob:1
xmove: dest:x src1:x len:5
xzero: dest:x len:5
xones: dest:x len:5
xconst: dest:x len:12

iconv_to_x: dest:x src1:i len:5
extract_i4: dest:i src1:x len:5
Expand Down
4 changes: 2 additions & 2 deletions src/mono/mono/mini/liveness.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ optimize_initlocals (MonoCompile *cfg)
//printf ("DEAD: "); mono_print_ins (ins);
if (cfg->disable_initlocals_opt_refs && var->type == STACK_OBJ)
continue;
if ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_R4CONST)) {
if ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_R4CONST) || (ins->opcode == OP_XCONST)) {
NULLIFY_INS (ins);
MONO_VARINFO (cfg, var->inst_c0)->spill_costs -= 1;
/*
Expand Down Expand Up @@ -874,7 +874,7 @@ update_liveness2 (MonoCompile *cfg, MonoInst *ins, gboolean set_volatile, int in
}
else {
/* Try dead code elimination */
if (!cfg->disable_deadce_vars && (var != cfg->ret) && !(var->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST)) && !(var->flags & MONO_INST_VOLATILE)) {
if (!cfg->disable_deadce_vars && (var != cfg->ret) && !(var->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_XCONST)) && !(var->flags & MONO_INST_VOLATILE)) {
LIVENESS_DEBUG (printf ("\tdead def of R%d, eliminated\n", ins->dreg));
NULLIFY_INS (ins);
return;
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/method-to-ir.c
Original file line number Diff line number Diff line change
Expand Up @@ -12640,6 +12640,7 @@ mono_op_no_side_effects (int opcode)
case OP_VZERO:
case OP_XZERO:
case OP_XONES:
case OP_XCONST:
case OP_ICONST:
case OP_I8CONST:
case OP_ADD_IMM:
Expand Down
48 changes: 45 additions & 3 deletions src/mono/mono/mini/mini-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -7296,6 +7296,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_XONES:
amd64_sse_pcmpeqb_reg_reg (code, ins->dreg, ins->dreg);
break;
case OP_XCONST: {
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(gpointer));
amd64_sse_movups_reg_membase (code, ins->dreg, AMD64_R11, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
amd64_sse_movups_reg_membase (code, ins->dreg, AMD64_RIP, 0);
}
break;
}
case OP_ICONV_TO_R4_RAW:
amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
if (!cfg->r4fp)
Expand Down Expand Up @@ -8140,11 +8151,13 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) {
if (patch_info->type == MONO_PATCH_INFO_EXC)
code_size += 40;
if (patch_info->type == MONO_PATCH_INFO_R8)
else if (patch_info->type == MONO_PATCH_INFO_X128)
code_size += 16 + 15; /* sizeof (Vector128<T>) + alignment */
else if (patch_info->type == MONO_PATCH_INFO_R8)
code_size += 8 + 15; /* sizeof (double) + alignment */
if (patch_info->type == MONO_PATCH_INFO_R4)
else if (patch_info->type == MONO_PATCH_INFO_R4)
code_size += 4 + 15; /* sizeof (float) + alignment */
if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR)
else if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR)
Comment on lines +8154 to +8160
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per the FIXME comment below, the alignment here only needs to be 8 for double and 4 for float. But some other logic was utilizing R4/R8 for packed SIMD instructions.

That should ideally be fixed and those updated to properly use X128 constants to save on space and help ensure correctness.

code_size += 8 + 7; /*sizeof (void*) + alignment */
}

Expand Down Expand Up @@ -8218,6 +8231,10 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
guint8 *pos, *patch_pos;
guint32 target_pos;

// FIXME: R8 and R4 only need 8 and 4 byte alignment, respectively
// They should be handled separately with anything needing 16 byte
// alignment using X128

/* The SSE opcodes require a 16 byte alignment */
code = (guint8*)ALIGN_TO (code, 16);

Expand All @@ -8244,6 +8261,31 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
remove = TRUE;
break;
}
case MONO_PATCH_INFO_X128: {
guint8 *pos, *patch_pos;
guint32 target_pos;

/* The SSE opcodes require a 16 byte alignment */
code = (guint8*)ALIGN_TO (code, 16);

pos = cfg->native_code + patch_info->ip.i;
if (IS_REX (pos [0])) {
patch_pos = pos + 4;
target_pos = GPTRDIFF_TO_UINT32 (code - pos - 8);
}
else {
patch_pos = pos + 3;
target_pos = GPTRDIFF_TO_UINT32 (code - pos - 7);
}

memcpy (code, patch_info->data.target, 16);
code += 16;

*(guint32*)(patch_pos) = target_pos;

remove = TRUE;
break;
}
Comment on lines +8264 to +8288
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original handling I added was the cause of the failures.

MonoJIT doesn't support the VEX encoding today (only the legacy encoding) and the R4/R8 patch point logic is effectively hardcoded to movss/movsd.

For that scenario, we have:

  • 1-byte prefix of 0xF3 (float) or 0xF2 (double)
  • Optional REX prefix of 0x44 if using extended registers (XMM8-XMM15)
  • 2-byte opcode of 0x0F 0x010
  • 1-byte modR/M byte
  • 4-byte RIP relative offset

So this the R4/R8 logic was getting the offset of the RIP relative offset so it could be patched.


For X128 we currently hardcode this to movups and so we instead have:

  • Optional REX prefix of 0x44 if using extended registers (XMM8-XMM15)
  • 2-byte opcode of 0x0F 0x010
  • 1-byte modR/M byte
  • 4-byte RIP relative offset

Thus it's 1-byte smaller. We could but don't want to emit movupd or movdqu as that's 1-byte larger for no benefit.

We could emit movaps instead, but there isn't any benefit to that on modern machines (anything since ~2011) and it strictly requires that the address be aligned, which is overall less flexible.

When the VEX support is added this all becomes a constant 4-bytes before the RIP relative offset is encoded (3-byte VEX encoded opcode + modR/M byte).

case MONO_PATCH_INFO_GC_CARD_TABLE_ADDR: {
guint8 *pos;

Expand Down
88 changes: 88 additions & 0 deletions src/mono/mono/mini/mini-llvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -7844,6 +7844,94 @@ MONO_RESTORE_WARNING
values [ins->dreg] = LLVMConstAllOnes (type_to_llvm_type (ctx, m_class_get_byval_arg (ins->klass)));
break;
}
case OP_XCONST: {
MonoTypeEnum etype;
int ecount;

const char *class_name = m_class_get_name (ins->klass);

if (!strcmp(class_name, "Vector`1") || !strcmp(class_name, "Vector64`1") || !strcmp (class_name, "Vector128`1") || !strcmp (class_name, "Vector256`1") || !strcmp (class_name, "Vector512`1")) {
MonoType *element_type = mono_class_get_context (ins->klass)->class_inst->type_argv [0];
etype = element_type->type;
ecount = mono_class_value_size (ins->klass, NULL) / mono_class_value_size (mono_class_from_mono_type_internal(element_type), NULL);
} else if (!strcmp(class_name, "Vector4") || !strcmp(class_name, "Plane") || !strcmp(class_name, "Quaternion")) {
etype = MONO_TYPE_R4;
ecount = 4;
} else {
g_assert_not_reached ();
}

LLVMTypeRef llvm_type = primitive_type_to_llvm_type(etype);
LLVMValueRef vals [64];

switch (etype) {
case MONO_TYPE_I1:
case MONO_TYPE_U1: {
guint8* v = (guint8*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_I2:
case MONO_TYPE_U2: {
guint16* v = (guint16*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
#if TARGET_SIZEOF_VOID_P == 4
case MONO_TYPE_I:
case MONO_TYPE_U:
#endif
case MONO_TYPE_I4:
case MONO_TYPE_U4: {
guint32* v = (guint32*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
#if TARGET_SIZEOF_VOID_P == 8
case MONO_TYPE_I:
case MONO_TYPE_U:
#endif
case MONO_TYPE_I8:
case MONO_TYPE_U8: {
guint64* v = (guint64*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_R4: {
float* v = (float*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstReal (llvm_type, v[i]);
}
break;
}
case MONO_TYPE_R8: {
double* v = (double*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstReal (llvm_type, v[i]);
}
break;
}
default:
g_assert_not_reached ();
}

values [ins->dreg] = LLVMConstVector (vals, ecount);
break;
}
case OP_LOADX_MEMBASE: {
LLVMTypeRef t = type_to_llvm_type (ctx, m_class_get_byval_arg (ins->klass));
LLVMValueRef src;
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,7 @@ MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)
MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE)
MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE)
MINI_OP(OP_XONES, "xones", XREG, NONE, NONE)
MINI_OP(OP_XCONST, "xconst", XREG, NONE, NONE)
MINI_OP(OP_XPHI, "xphi", XREG, NONE, NONE)

/*
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/mini/mini-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,8 @@ mono_patch_info_hash (gconstpointer data)
}
case MONO_PATCH_INFO_GSHAREDVT_IN_WRAPPER:
return hash | mono_signature_hash (ji->data.sig);
case MONO_PATCH_INFO_X128_GOT:
return hash | (guint32)*(double*)ji->data.target;
case MONO_PATCH_INFO_R8_GOT:
return hash | (guint32)*(double*)ji->data.target;
Comment on lines +1284 to 1287
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how the hashes are currently being built for R4/R8, but they don't necessarily make "sense" to me.

This isn't a "good" hash since it just or's bits together, it doesn't take into account the "upper bits", and it relies on undefined behavior in the case the double or float is NaN/Infinity/out of range.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not the best, but its just used during JITting/AOTing, so it's not a problem in practice.

case MONO_PATCH_INFO_R4_GOT:
Expand Down Expand Up @@ -1605,6 +1607,8 @@ mono_resolve_patch_target_ext (MonoMemoryManager *mem_manager, MonoMethod *metho
case MONO_PATCH_INFO_R4_GOT:
case MONO_PATCH_INFO_R8:
case MONO_PATCH_INFO_R8_GOT:
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT:
target = patch_info->data.target;
break;
case MONO_PATCH_INFO_EXC_NAME:
Expand Down
6 changes: 4 additions & 2 deletions src/mono/mono/mini/patch-info.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ PATCH_INFO(LDTOKEN, "ldtoken")
PATCH_INFO(TYPE_FROM_HANDLE, "type_from_handle")
PATCH_INFO(R4, "r4")
PATCH_INFO(R8, "r8")
PATCH_INFO(X128, "x128")
PATCH_INFO(IP, "ip")
PATCH_INFO(IID, "iid")
PATCH_INFO(ADJUSTED_IID, "adjusted_iid")
Expand Down Expand Up @@ -76,12 +77,13 @@ PATCH_INFO(SPECIFIC_TRAMPOLINES, "specific_trampolines")
PATCH_INFO(SPECIFIC_TRAMPOLINES_GOT_SLOTS_BASE, "specific_trampolines_got_slots_base")

/*
* PATCH_INFO_R4/R8 is handled by mono_arch_emit_exceptions () by emitting the data
* into the text segment after the method body. These patches emit the data
* PATCH_INFO_* is handled by mono_arch_emit_exceptions () by emitting the data
* into the text segment after the method body. PATCH_INFO_*_GOT emit the data
* elsewhere.
*/
PATCH_INFO(R8_GOT, "r8_got")
PATCH_INFO(R4_GOT, "r4_got")
PATCH_INFO(X128_GOT, "x128_got")

/* MonoMethod* -> the address of a memory slot which is used to cache the pinvoke address */
PATCH_INFO(METHOD_PINVOKE_ADDR_CACHE, "pinvoke_addr_cache")
Expand Down
Loading