From 2f36a6665cc5ad0aeec49d6cce9425830d1cee41 Mon Sep 17 00:00:00 2001 From: gdkchan Date: Thu, 12 Sep 2024 18:22:30 -0300 Subject: [PATCH] Implement Arm32 VSHLL and QADD16 instructions (#7301) --- src/ARMeilleure/Decoders/OpCodeTable.cs | 2 + src/ARMeilleure/Instructions/InstEmitAlu32.cs | 113 ++++++++++++++++++ .../Instructions/InstEmitSimdShift32.cs | 32 +++++ src/ARMeilleure/Instructions/InstName.cs | 1 + src/Ryujinx.Tests/Cpu/CpuTestAlu32.cs | 1 + src/Ryujinx.Tests/Cpu/CpuTestSimd32.cs | 23 ++++ 6 files changed, 172 insertions(+) diff --git a/src/ARMeilleure/Decoders/OpCodeTable.cs b/src/ARMeilleure/Decoders/OpCodeTable.cs index 8595356704df..20d567fe5998 100644 --- a/src/ARMeilleure/Decoders/OpCodeTable.cs +++ b/src/ARMeilleure/Decoders/OpCodeTable.cs @@ -746,6 +746,7 @@ static OpCodeTable() SetA32("<<<<01101000xxxxxxxxxxxxxx01xxxx", InstName.Pkh, InstEmit32.Pkh, OpCode32AluRsImm.Create); SetA32("11110101xx01xxxx1111xxxxxxxxxxxx", InstName.Pld, InstEmit32.Nop, OpCode32.Create); SetA32("11110111xx01xxxx1111xxxxxxx0xxxx", InstName.Pld, InstEmit32.Nop, OpCode32.Create); + SetA32("<<<<01100010xxxxxxxx11110001xxxx", InstName.Qadd16, InstEmit32.Qadd16, OpCode32AluReg.Create); SetA32("<<<<011011111111xxxx11110011xxxx", InstName.Rbit, InstEmit32.Rbit, OpCode32AluReg.Create); SetA32("<<<<011010111111xxxx11110011xxxx", InstName.Rev, InstEmit32.Rev, OpCode32AluReg.Create); SetA32("<<<<011010111111xxxx11111011xxxx", InstName.Rev16, InstEmit32.Rev16, OpCode32AluReg.Create); @@ -1034,6 +1035,7 @@ static OpCodeTable() SetAsimd("111100101x>>>xxxxxxx0101>xx1xxxx", InstName.Vshl, InstEmit32.Vshl, OpCode32SimdShImm.Create, OpCode32SimdShImm.CreateT32); SetAsimd("1111001x0xxxxxxxxxxx0100xxx0xxxx", InstName.Vshl, InstEmit32.Vshl_I, OpCode32SimdReg.Create, OpCode32SimdReg.CreateT32); SetAsimd("1111001x1x>>>xxxxxxx101000x1xxxx", InstName.Vshll, InstEmit32.Vshll, OpCode32SimdShImmLong.Create, OpCode32SimdShImmLong.CreateT32); // A1 encoding. + SetAsimd("111100111x11<<10xxxx001100x0xxxx", InstName.Vshll, InstEmit32.Vshll2, OpCode32SimdMovn.Create, OpCode32SimdMovn.CreateT32); // A2 encoding. SetAsimd("1111001x1x>>>xxxxxxx0000>xx1xxxx", InstName.Vshr, InstEmit32.Vshr, OpCode32SimdShImm.Create, OpCode32SimdShImm.CreateT32); SetAsimd("111100101x>>>xxxxxxx100000x1xxx0", InstName.Vshrn, InstEmit32.Vshrn, OpCode32SimdShImmNarrow.Create, OpCode32SimdShImmNarrow.CreateT32); SetAsimd("111100111x>>>xxxxxxx0101>xx1xxxx", InstName.Vsli, InstEmit32.Vsli_I, OpCode32SimdShImm.Create, OpCode32SimdShImm.CreateT32); diff --git a/src/ARMeilleure/Instructions/InstEmitAlu32.cs b/src/ARMeilleure/Instructions/InstEmitAlu32.cs index 9f419ba99e7d..8eabe093e852 100644 --- a/src/ARMeilleure/Instructions/InstEmitAlu32.cs +++ b/src/ARMeilleure/Instructions/InstEmitAlu32.cs @@ -292,6 +292,16 @@ public static void Pkh(ArmEmitterContext context) EmitAluStore(context, res); } + public static void Qadd16(ArmEmitterContext context) + { + OpCode32AluReg op = (OpCode32AluReg)context.CurrOp; + + SetIntA32(context, op.Rd, EmitSigned16BitPair(context, GetIntA32(context, op.Rn), GetIntA32(context, op.Rm), (d, n, m) => + { + EmitSaturateRange(context, d, context.Add(n, m), 16, unsigned: false, setQ: false); + })); + } + public static void Rbit(ArmEmitterContext context) { Operand m = GetAluM(context); @@ -976,6 +986,94 @@ void SetD(int part, Operand value) } } + private static void EmitSaturateRange(ArmEmitterContext context, Operand result, Operand value, uint saturateTo, bool unsigned, bool setQ = true) + { + Debug.Assert(saturateTo <= 32); + Debug.Assert(!unsigned || saturateTo < 32); + + if (!unsigned && saturateTo == 32) + { + // No saturation possible for this case. + + context.Copy(result, value); + + return; + } + else if (saturateTo == 0) + { + // Result is always zero if we saturate 0 bits. + + context.Copy(result, Const(0)); + + return; + } + + Operand satValue; + + if (unsigned) + { + // Negative values always saturate (to zero). + // So we must always ignore the sign bit when masking, so that the truncated value will differ from the original one. + + satValue = context.BitwiseAnd(value, Const((int)(uint.MaxValue >> (32 - (int)saturateTo)))); + } + else + { + satValue = context.ShiftLeft(value, Const(32 - (int)saturateTo)); + satValue = context.ShiftRightSI(satValue, Const(32 - (int)saturateTo)); + } + + // If the result is 0, the values are equal and we don't need saturation. + Operand lblNoSat = Label(); + context.BranchIfFalse(lblNoSat, context.Subtract(value, satValue)); + + // Saturate and set Q flag. + if (unsigned) + { + if (saturateTo == 31) + { + // Only saturation case possible when going from 32 bits signed to 32 or 31 bits unsigned + // is when the signed input is negative, as all positive values are representable on a 31 bits range. + + satValue = Const(0); + } + else + { + satValue = context.ShiftRightSI(value, Const(31)); + satValue = context.BitwiseNot(satValue); + satValue = context.ShiftRightUI(satValue, Const(32 - (int)saturateTo)); + } + } + else + { + if (saturateTo == 1) + { + satValue = context.ShiftRightSI(value, Const(31)); + } + else + { + satValue = Const(uint.MaxValue >> (33 - (int)saturateTo)); + satValue = context.BitwiseExclusiveOr(satValue, context.ShiftRightSI(value, Const(31))); + } + } + + if (setQ) + { + SetFlag(context, PState.QFlag, Const(1)); + } + + context.Copy(result, satValue); + + Operand lblExit = Label(); + context.Branch(lblExit); + + context.MarkLabel(lblNoSat); + + context.Copy(result, value); + + context.MarkLabel(lblExit); + } + private static void EmitSaturateUqadd(ArmEmitterContext context, Operand result, Operand value, uint saturateTo) { Debug.Assert(saturateTo <= 32); @@ -1053,6 +1151,21 @@ private static void EmitSaturateUqsub(ArmEmitterContext context, Operand result, context.MarkLabel(lblExit); } + private static Operand EmitSigned16BitPair(ArmEmitterContext context, Operand rn, Operand rm, Action elementAction) + { + Operand tempD = context.AllocateLocal(OperandType.I32); + + Operand tempN = context.SignExtend16(OperandType.I32, rn); + Operand tempM = context.SignExtend16(OperandType.I32, rm); + elementAction(tempD, tempN, tempM); + Operand tempD2 = context.ZeroExtend16(OperandType.I32, tempD); + + tempN = context.ShiftRightSI(rn, Const(16)); + tempM = context.ShiftRightSI(rm, Const(16)); + elementAction(tempD, tempN, tempM); + return context.BitwiseOr(tempD2, context.ShiftLeft(tempD, Const(16))); + } + private static Operand EmitUnsigned16BitPair(ArmEmitterContext context, Operand rn, Operand rm, Action elementAction) { Operand tempD = context.AllocateLocal(OperandType.I32); diff --git a/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs b/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs index e9e3b52b90c3..eb28a0c5af0c 100644 --- a/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs +++ b/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs @@ -106,6 +106,38 @@ public static void Vshll(ArmEmitterContext context) context.Copy(GetVecA32(op.Qd), res); } + public static void Vshll2(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, !op.U); + + if (op.Size == 2) + { + if (op.U) + { + me = context.ZeroExtend32(OperandType.I64, me); + } + else + { + me = context.SignExtend32(OperandType.I64, me); + } + } + + me = context.ShiftLeft(me, Const(8 << op.Size)); + + res = EmitVectorInsert(context, res, me, index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + public static void Vshr(ArmEmitterContext context) { OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; diff --git a/src/ARMeilleure/Instructions/InstName.cs b/src/ARMeilleure/Instructions/InstName.cs index ac85412d1bbd..74c33155b17b 100644 --- a/src/ARMeilleure/Instructions/InstName.cs +++ b/src/ARMeilleure/Instructions/InstName.cs @@ -527,6 +527,7 @@ enum InstName Pld, Pop, Push, + Qadd16, Rev, Revsh, Rsb, diff --git a/src/Ryujinx.Tests/Cpu/CpuTestAlu32.cs b/src/Ryujinx.Tests/Cpu/CpuTestAlu32.cs index 132ddfd0e861..1e66d8112e71 100644 --- a/src/Ryujinx.Tests/Cpu/CpuTestAlu32.cs +++ b/src/Ryujinx.Tests/Cpu/CpuTestAlu32.cs @@ -29,6 +29,7 @@ private static uint[] UQAddSub16() { return new[] { + 0xe6200f10u, // QADD16 R0, R0, R0 0xe6600f10u, // UQADD16 R0, R0, R0 0xe6600f70u, // UQSUB16 R0, R0, R0 }; diff --git a/src/Ryujinx.Tests/Cpu/CpuTestSimd32.cs b/src/Ryujinx.Tests/Cpu/CpuTestSimd32.cs index f843fd561a62..08202c9e174e 100644 --- a/src/Ryujinx.Tests/Cpu/CpuTestSimd32.cs +++ b/src/Ryujinx.Tests/Cpu/CpuTestSimd32.cs @@ -328,6 +328,29 @@ public void Vmovn_V([Range(0u, 3u)] uint rd, CompareAgainstUnicorn(); } + [Test, Pairwise, Description("VSHLL. {}, , #")] + public void Vshll([Values(0u, 2u)] uint rd, + [Values(1u, 0u)] uint rm, + [Values(0u, 1u, 2u)] uint size, + [Random(RndCnt)] ulong z, + [Random(RndCnt)] ulong a, + [Random(RndCnt)] ulong b) + { + uint opcode = 0xf3b20300u; // VSHLL.I8 Q0, D0, #8 + + opcode |= ((rm & 0xf) << 0) | ((rm & 0x10) << 1); + opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); + opcode |= size << 18; + + V128 v0 = MakeVectorE0E1(z, z); + V128 v1 = MakeVectorE0E1(a, z); + V128 v2 = MakeVectorE0E1(b, z); + + SingleOpcode(opcode, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(); + } + [Test, Pairwise, Description("VSWP D0, D0")] public void Vswp([Values(0u, 1u)] uint rd, [Values(0u, 1u)] uint rm,