From 7ea273641cb97cd9412c44dc15c29818315bdd4b Mon Sep 17 00:00:00 2001 From: GorogPeter Date: Tue, 31 Oct 2023 12:00:23 +0100 Subject: [PATCH] Add X86 SIMD float/double abs/max/min --- src/jit/SimdX86Inl.h | 110 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/src/jit/SimdX86Inl.h b/src/jit/SimdX86Inl.h index 1a71065f1..b7e83114f 100644 --- a/src/jit/SimdX86Inl.h +++ b/src/jit/SimdX86Inl.h @@ -48,6 +48,8 @@ enum Type : uint32_t { cmpleps = OPCODE_AND_IMM(0xc2, 2), cmpltpd = OPCODE_AND_IMM(0xc2, 1) | SimdOp::prefix66, cmpltps = OPCODE_AND_IMM(0xc2, 1), + cmpunordpd = OPCODE_AND_IMM(0xc2, 3) | SimdOp::prefix66, /* opcode 0xc2 with compare predicate 3 (unordered), 0x66-prefixed packed-double form */ + cmpunordps = OPCODE_AND_IMM(0xc2, 3), /* same predicate 3 (unordered), packed-single form */ cmpneqpd = OPCODE_AND_IMM(0xc2, 4) | SimdOp::prefix66, cmpneqps = OPCODE_AND_IMM(0xc2, 4), cmpnltpd = OPCODE_AND_IMM(0xc2, 5) | SimdOp::prefix66, @@ -63,6 +65,7 @@ enum Type : uint32_t { maxpd = 0x5f | SimdOp::prefix66, maxps = 0x5f, minpd = 0x5d | SimdOp::prefix66, + minps = 0x5d, /* packed-single counterpart of minpd: same opcode byte without the 0x66 prefix */ mulpd = 0x59 | SimdOp::prefix66, mulps = 0x59, orpd = 0x56 | SimdOp::prefix66, @@ -365,7 +368,7 @@ static void simdEmitINeg(sljit_compiler* compiler, uint32_t signOpcode, uint32_t return; } - ASSERT(subOpcode = SimdOp::psubq); + ASSERT(subOpcode == SimdOp::psubq); /* comparison, not assignment: the previous '=' wrote to subOpcode instead of checking it */ if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { simdEmitSSEOp(compiler, SimdOp::pxor, tmp, tmp); @@ -625,6 +628,21 @@ static void simdEmitTruncSatF32x4U(sljit_compiler* compiler, sljit_s32 rd, sljit simdEmitSSEOp(compiler, SimdOp::paddd, rd, tmp1); } +static void simdEmitAbs(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, bool is64) { /* Emits a lane-wise floating-point absolute value of rn into rd by masking off the top bit of every lane. is64 selects 64-bit lanes, otherwise 32-bit lanes. Uses SLJIT_FR1 as a scratch register. */ + sljit_s32 tmp = SLJIT_FR1; + simdEmitSSEOp(compiler, (is64 ? SimdOp::pcmpeqq : SimdOp::pcmpeqd), tmp, tmp); /* comparing tmp with itself for equality sets every bit of tmp */ + simdEmitSSEOp(compiler, OPCODE_AND_IMM((is64 ? SimdOp::psrlq_i : SimdOp::psrld_i) , 1), SimdOp::psrl_i_arg, tmp); /* logical right shift of each lane by 1 clears the top (sign) bit of the mask; NOTE(review): psrl_i_arg presumably fills the ModRM reg field as the opcode extension, with tmp as the shifted register (confirm) */ + if (rd != rn) { + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, (is64 ? 
SimdOp::andpd : SimdOp::andps), rd, rn, tmp); /* presumably the three-operand VEX form rd = rn AND tmp, so no separate copy of rn is needed */ + return; + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, rd, rn, 0); /* without AVX the AND below is destructive, so rn is first copied into rd */ + } + } + simdEmitSSEOp(compiler, (is64 ? SimdOp::andpd : SimdOp::andps), rd, tmp); /* rd = rd AND mask */ +} + static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) { Operand* operands = instr->operands(); @@ -694,6 +712,7 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) srcType = SLJIT_SIMD_ELEM_64; dstType = SLJIT_SIMD_ELEM_64; break; + case ByteCode::F32X4AbsOpcode: case ByteCode::F32X4NegOpcode: case ByteCode::F32X4SqrtOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; @@ -708,6 +727,7 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; break; + case ByteCode::F64X2AbsOpcode: case ByteCode::F64X2NegOpcode: case ByteCode::F64X2SqrtOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; @@ -799,6 +819,9 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) simdEmitSSEOp(compiler, OPCODE_AND_IMM(SimdOp::pshufd, 0xe), dst, args[0].arg); simdEmitSSEOp(compiler, SimdOp::pmovzxwd, dst, dst); break; + case ByteCode::F32X4AbsOpcode: + simdEmitAbs(compiler, dst, args[0].arg, false); /* is64 = false: 32-bit lanes */ + break; case ByteCode::F32X4NegOpcode: simdEmitUnaryImm(compiler, SimdOp::xorps, dst, args[0].arg); break; @@ -832,6 +855,9 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F32X4DemoteF64X2ZeroOpcode: simdEmitSSEOp(compiler, SimdOp::cvtpd2ps, dst, args[0].arg); break; + case ByteCode::F64X2AbsOpcode: + simdEmitAbs(compiler, dst, args[0].arg, true); /* is64 = true: 64-bit lanes */ + break; case ByteCode::F64X2NegOpcode: simdEmitUnaryImm(compiler, SimdOp::xorpd, dst, args[0].arg); break; @@ -907,6 +933,70 @@ static void simdEmitPMinMax(sljit_compiler* compiler, uint32_t operation, sljit_ simdEmitSSEOp(compiler, is64 ? 
SimdOp::orpd : SimdOp::orps, rd, tmp); } +static void simdEmitFloatMax(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, bool is64) { /* Emits a lane-wise floating-point maximum of rn and rm into rd; is64 selects 64-bit lanes, otherwise 32-bit lanes. The max instruction is issued in both operand orders, one result in the scratch register SLJIT_FR2 and one in rd, and the two results are then merged by the bitwise sequence at the end. NOTE(review): presumably implements the WebAssembly f32x4.max / f64x2.max rules for NaN and signed zero (confirm against the spec). */ + sljit_s32 tmp = SLJIT_FR2; + if (rd == rn || rd == rm) { /* rd aliases one input: src is the other input and rd is consumed in place */ + sljit_s32 src = (rd == rn) ? rm : rn; + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), tmp, src, rd); /* first ordering into tmp; must run before rd is overwritten */ + simdEmitVexOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), rd, rd, src); /* opposite ordering into rd */ + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, tmp, src, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), tmp, rd); + simdEmitSSEOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), rd, src); + } + } else { /* rd is distinct from both inputs, so neither rn nor rm is modified */ + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), tmp, rn, rm); + simdEmitVexOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), rd, rm, rn); + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, tmp, rn, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), tmp, rm); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, rd, rm, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::maxpd : SimdOp::maxps), rd, rn); + } + } + + simdEmitSSEOp(compiler, (is64 ? SimdOp::xorpd : SimdOp::xorps), rd, tmp); /* rd = bits in which the two orderings disagree */ + simdEmitSSEOp(compiler, (is64 ? SimdOp::orpd : SimdOp::orps), tmp, rd); /* tmp = tmp OR disagreeing bits */ + simdEmitSSEOp(compiler, (is64 ? SimdOp::subpd : SimdOp::subps), tmp, rd); /* tmp = tmp minus rd; NOTE(review): presumably settles the sign when the orderings returned zeros of different sign and quiets NaN lanes (confirm) */ + simdEmitSSEOp(compiler, (is64 ? SimdOp::cmpunordpd : SimdOp::cmpunordps), rd, tmp); /* rd = all ones in lanes where rd or tmp is NaN, zero elsewhere */ + simdEmitSSEOp(compiler, (is64 ? OPCODE_AND_IMM(SimdOp::psrlq_i, 13) : OPCODE_AND_IMM(SimdOp::psrld_i, 10)), SimdOp::psrl_i_arg, rd); /* logical right shift of the mask: by 13 leaves the low 51 bits set in a 64-bit lane, by 10 leaves the low 22 bits set in a 32-bit lane */ + simdEmitSSEOp(compiler, (is64 ? 
SimdOp::andnpd : SimdOp::andnps), rd, tmp); /* rd = (NOT rd) AND tmp: clears those low mantissa bits in NaN lanes and passes every other lane of tmp through unchanged */ +} + +static void simdEmitFloatMin(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, bool is64) { /* Emits a lane-wise floating-point minimum of rn and rm into rd; is64 selects 64-bit lanes, otherwise 32-bit lanes. Mirrors simdEmitFloatMax: the min instruction is issued in both operand orders (scratch SLJIT_FR2 and rd) and the results are merged bitwise. NOTE(review): presumably implements the WebAssembly f32x4.min / f64x2.min rules for NaN and signed zero (confirm against the spec). */ + sljit_s32 tmp = SLJIT_FR2; + + if (rd == rn || rd == rm) { /* rd aliases one input: src is the other input and rd is consumed in place */ + sljit_s32 src = (rd == rn) ? rm : rn; + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), tmp, src, rd); /* first ordering into tmp; must run before rd is overwritten */ + simdEmitVexOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), rd, rd, src); /* opposite ordering into rd */ + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, tmp, src, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), tmp, rd); + simdEmitSSEOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), rd, src); + } + } else { /* rd is distinct from both inputs, so neither rn nor rm is modified */ + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), tmp, rn, rm); + simdEmitVexOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), rd, rm, rn); + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, tmp, rn, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), tmp, rm); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, rd, rm, 0); + simdEmitSSEOp(compiler, (is64 ? SimdOp::minpd : SimdOp::minps), rd, rn); + } + } + + simdEmitSSEOp(compiler, (is64 ? SimdOp::orpd : SimdOp::orps), tmp, rd); /* tmp = bitwise OR of the two orderings */ + simdEmitSSEOp(compiler, (is64 ? SimdOp::cmpunordpd : SimdOp::cmpunordps), rd, tmp); /* rd = all ones in lanes where rd or tmp is NaN, zero elsewhere */ + simdEmitSSEOp(compiler, (is64 ? SimdOp::orpd : SimdOp::orps), tmp, rd); /* sets every bit of tmp in the NaN lanes; other lanes are unchanged */ + simdEmitSSEOp(compiler, (is64 ? OPCODE_AND_IMM(SimdOp::psrlq_i, 13) : OPCODE_AND_IMM(SimdOp::psrld_i, 10)), SimdOp::psrl_i_arg, rd); /* logical right shift of the mask: by 13 leaves the low 51 bits set in a 64-bit lane, by 10 leaves the low 22 bits set in a 32-bit lane */ + simdEmitSSEOp(compiler, (is64 ? 
SimdOp::andnpd : SimdOp::andnps), rd, tmp); /* rd = (NOT rd) AND tmp: clears those low mantissa bits in NaN lanes and passes every other lane of tmp through unchanged */ +} + static void simdEmitI64X2Mul(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm) { sljit_s32 tmp1 = SLJIT_FR2; @@ -1261,6 +1351,8 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F32X4LeOpcode: case ByteCode::F32X4PMinOpcode: case ByteCode::F32X4PMaxOpcode: + case ByteCode::F32X4MaxOpcode: + case ByteCode::F32X4MinOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; break; @@ -1279,6 +1371,8 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F64X2LeOpcode: case ByteCode::F64X2PMinOpcode: case ByteCode::F64X2PMaxOpcode: + case ByteCode::F64X2MaxOpcode: + case ByteCode::F64X2MinOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; break; @@ -1580,6 +1674,14 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F32X4PMaxOpcode: simdEmitPMinMax(compiler, 0, dst, args[0].arg, args[1].arg); break; + case ByteCode::F32X4MaxOpcode: { + simdEmitFloatMax(compiler, dst, args[0].arg, args[1].arg, false); /* is64 = false: 32-bit lanes */ + break; + } + case ByteCode::F32X4MinOpcode: { + simdEmitFloatMin(compiler, dst, args[0].arg, args[1].arg, false); /* is64 = false: 32-bit lanes */ + break; + } case ByteCode::F64X2AddOpcode: simdEmitOp(compiler, SimdOp::addpd, dst, args[0].arg, args[1].arg); break; @@ -1619,6 +1721,12 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F64X2PMaxOpcode: simdEmitPMinMax(compiler, SimdOp::is64, dst, args[0].arg, args[1].arg); break; + case ByteCode::F64X2MaxOpcode: + simdEmitFloatMax(compiler, dst, args[0].arg, args[1].arg, true); /* is64 = true: 64-bit lanes */ + break; + case ByteCode::F64X2MinOpcode: + simdEmitFloatMin(compiler, dst, args[0].arg, args[1].arg, true); /* is64 = true: 64-bit lanes */ + break; case ByteCode::V128AndOpcode: simdEmitOp(compiler, SimdOp::pand, dst, args[0].arg, args[1].arg); break;