Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wasm] Optimize constant i2/i4 shuffles in jiterpreter #86470

Merged
merged 1 commit into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/mono/mono/mini/interp/interp-simd-intrins.def
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52)

// wasm only has a swizzle opcode for i8x16, none of the others
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14)
// jiterp has special handling for i1 shuffles to secure a v8 optimization
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0)
Expand Down
123 changes: 86 additions & 37 deletions src/mono/wasm/runtime/jiterpreter-trace-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,10 @@ function is_backward_branch_target(
return false;
}

const knownConstantValues = new Map<number, number>();
type KnownConstantValue = number | Uint8Array;
const knownConstantValues = new Map<number, KnownConstantValue>();

function get_known_constant_value(builder: WasmBuilder, localOffset: number): number | undefined {
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
if (isAddressTaken(builder, localOffset))
return undefined;

Expand Down Expand Up @@ -1370,8 +1371,11 @@ export function generateWasmBody(
) {
if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex))
ip = abort;
else
else {
containsSimd = true;
// We need to do dreg invalidation differently for simd, especially to handle ldc
skipDregInvalidation = true;
}
} else if (opcodeValue === 0) {
// This means it was explicitly marked as no-value in the opcode value table
// so we can just skip over it. This is done for things like nops.
Expand Down Expand Up @@ -1517,6 +1521,9 @@ function append_stloc_tail(builder: WasmBuilder, offset: number, opcodeOrPrefix:
const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2;
builder.appendMemarg(offset, alignment);
invalidate_local(offset);
// HACK: Invalidate the second stack slot used by a simd vector
if (simdOpcode !== undefined)
invalidate_local(offset + 8);
}

// Pass bytesInvalidated=0 if you are reading from the local and the address will never be
Expand Down Expand Up @@ -3098,10 +3105,10 @@ function emit_simd(
case MintOpcode.MINT_SIMD_V128_LDC: {
if (builder.options.enableSimd && getIsWasmSimdSupported()) {
builder.local("pLocals");
builder.v128_const(
localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128)
);
const view = localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128);
builder.v128_const(view);
append_simd_store(builder, ip);
knownConstantValues.set(getArgU16(ip, 1), view);
} else {
// dest
append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
Expand Down Expand Up @@ -3281,11 +3288,34 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
builder.appendU8(WasmOpcode.i32_eqz);
append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
return true;
case SimdIntrinsic3.V128_I1_SHUFFLE: {
// Detect a constant indices vector and turn it into a const. This allows
// v8 to use a more optimized implementation of the swizzle opcode
const indicesOffset = getArgU16(ip, 3),
constantIndices = get_known_constant_value(builder, indicesOffset);

// Pre-load destination ptr
builder.local("pLocals");
// Load vec
append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);

if (typeof (constantIndices) === "object") {
// HACK: Use the known constant vector directly instead of loading it from memory.
builder.appendSimd(WasmSimdOpcode.v128_const);
builder.appendBytes(constantIndices);
} else {
// Load the indices from memory
append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
}

// we now have two vectors on the stack, the values and the byte indices
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
append_simd_store(builder, ip);
return true;
}
case SimdIntrinsic3.V128_I2_SHUFFLE:
case SimdIntrinsic3.V128_I4_SHUFFLE:
// FIXME: I8
// FIXME: Many uses of these shuffles have constant shuffle indices,
// which we could convert into bytes at compile time for vastly improved performance
return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4);
default:
return false;
Expand All @@ -3297,41 +3327,60 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
// implement i16 and i32 shuffles on top of wasm's only shuffle opcode by expanding the
// element shuffle indices into byte indices
function emit_shuffle(builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean {
const elementSize = 16 / elementCount;
const elementSize = 16 / elementCount,
indicesOffset = getArgU16(ip, 3),
constantIndices = get_known_constant_value(builder, indicesOffset);
mono_assert((elementSize === 2) || (elementSize === 4), "Unsupported shuffle element size");

// Pre-load destination ptr
builder.local("pLocals");
// Load vec
append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// Load indices (in chars)
append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
if (elementCount === 4) {
// i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
if (typeof (constantIndices) === "object") {
// HACK: We have a known constant shuffle vector with char or int indices. Expand it to
// byte indices and then embed a new constant in the trace.
const newShuffleVector = new Uint8Array(sizeOfV128),
nativeIndices = (elementSize === 2)
? new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount)
: new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount);
for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) {
const elementIndex = nativeIndices[i];
for (let j = 0; j < elementSize; j++)
newShuffleVector[k + j] = (elementIndex * elementSize) + j;
}
// console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`);
builder.appendSimd(WasmSimdOpcode.v128_const);
builder.appendBytes(newShuffleVector);
} else {
// Load indices (element indices: chars when elementSize is 2, ints when it is 4)
append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
if (elementCount === 4) {
// i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
builder.v128_const(0);
builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
}
// Load a zero vector (narrow takes two vectors)
builder.v128_const(0);
builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
}
// Load a zero vector (narrow takes two vectors)
builder.v128_const(0);
// i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
// i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(i);
}
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
// multiply indices by 2 to scale from char indices to byte indices
builder.i32_const(elementCount === 4 ? 2 : 1);
builder.appendSimd(WasmSimdOpcode.i8x16_shl);
// now add 1 to the secondary lane of each char
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(j);
// i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
// i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(i);
}
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
// shift left to scale element indices to byte indices (x2 for char indices, x4 for int indices)
builder.i32_const(elementCount === 4 ? 2 : 1);
builder.appendSimd(WasmSimdOpcode.i8x16_shl);
// now build a vector holding each lane's byte offset within its element (0..elementSize-1)
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(j);
}
}
// we can do a bitwise or instead of an add since the shift above cleared the low bit(s) of every lane
builder.appendSimd(WasmSimdOpcode.v128_or);
// we now have two vectors on the stack, the values and the byte indices
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
append_simd_store(builder, ip);
Expand Down