spu/arm64: clean up assembly code generation
Clean up asmjit usage so we don't unnecessarily allocate memory
anymore for SPURecompiler functions.
sguo35 committed Sep 5, 2022
1 parent ab31e96 commit ab99311
Showing 1 changed file with 178 additions and 104 deletions.

rpcs3/Emu/Cell/SPURecompiler.cpp
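Note (not part of the commit): the hand-written bytes throughout this diff are little-endian AArch64 instruction words. As a reading aid, here is a minimal sketch of the two encodings that recur most often, LDR (literal) and BR; the helper names and checks are illustrative only, not code from the repository.

#include <cstdint>

// LDR (literal), 64-bit variant: 0x58000000 | (imm19 << 5) | Rt,
// where imm19 is the byte offset from the instruction to the literal, divided by 4.
constexpr std::uint32_t ldr_x_literal(std::uint32_t rt, std::int32_t byte_offset)
{
	return 0x58000000u | ((static_cast<std::uint32_t>(byte_offset / 4) & 0x7FFFF) << 5) | rt;
}

// BR Xn: 0xD61F0000 | (Rn << 5)
constexpr std::uint32_t br(std::uint32_t rn)
{
	return 0xD61F0000u | (rn << 5);
}

// Instruction words are stored little-endian, so 0x58000089 appears in the
// diff as the byte sequence 89 00 00 58.
static_assert(ldr_x_literal(9, 16) == 0x58000089, "ldr x9, #16");
static_assert(ldr_x_literal(9, 8) == 0x58000049, "ldr x9, #8");
static_assert(br(9) == 0xD61F0120, "br x9");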
@@ -1038,43 +1038,73 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 
 	auto make_jump = [&](asmjit::arm::CondCode op, auto target)
 	{
+		// 36 bytes
 		// Fallback to dispatch if no target
 		const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
 
-		// Build with asmjit to make things more readable
-		// Might cost some RAM, but meh whatever
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 48)), Imm(48));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 32)), Imm(32));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 16)), Imm(16));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr)), Imm(0));
-
-			if (op == arm::CondCode::kAlways)
-			{
-				c.br(a64::x9);
-				// Constant length per jmp for easier stub patching
-				c.nop();
-				c.nop();
-			} else
-			{
-				Label do_branch = c.newLabel();
-				Label cont = c.newLabel();
-
-				c.b(op, do_branch);
-				c.b(cont);
-
-				c.bind(do_branch);
-				c.br(a64::x9);
-
-				c.bind(cont);
-			}
-		});
-		u8 mem_used = 7 * 4;
-		memcpy(raw, reinterpret_cast<u8*>(temp), mem_used);
-		raw += mem_used;
+		// ldr x9, #16 -> ldr x9, taddr
+		*raw++ = 0x89;
+		*raw++ = 0x00;
+		*raw++ = 0x00;
+		*raw++ = 0x58;
+
+		if (op == asmjit::arm::CondCode::kAlways)
+		{
+			// br x9
+			*raw++ = 0x20;
+			*raw++ = 0x01;
+			*raw++ = 0x1F;
+			*raw++ = 0xD6;
+
+			// nop
+			*raw++ = 0x1F;
+			*raw++ = 0x20;
+			*raw++ = 0x03;
+			*raw++ = 0xD5;
+
+			// nop
+			*raw++ = 0x1F;
+			*raw++ = 0x20;
+			*raw++ = 0x03;
+			*raw++ = 0xD5;
+		}
+		else
+		{
+			// b.COND #8 -> b.COND do_branch
+			switch (op)
+			{
+			case asmjit::arm::CondCode::kUnsignedLT:
+				*raw++ = 0x43;
+				break;
+			case asmjit::arm::CondCode::kUnsignedGT:
+				*raw++ = 0x48;
+				break;
+			default:
+				asm("brk 0x42");
+			}
+
+			*raw++ = 0x00;
+			*raw++ = 0x00;
+			*raw++ = 0x54;
+
+			// b #16 -> b cont
+			*raw++ = 0x04;
+			*raw++ = 0x00;
+			*raw++ = 0x00;
+			*raw++ = 0x14;
+
+			// do_branch: br x9
+			*raw++ = 0x20;
+			*raw++ = 0x01;
+			*raw++ = 0x1f;
+			*raw++ = 0xD6;
+		}
+
+		// taddr
+		std::memcpy(raw, &taddr, 8);
+		raw += 8;
+
+		// cont: next instruction
 	};
#elif defined(ARCH_X64)
 	// Allocate some writable executable memory
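Note (not part of the commit): reconstructed from the bytes above, the stub emitted by make_jump has the layout sketched below, and the 0x43/0x48 bytes come from the B.cond condition field (LO for kUnsignedLT, HI for kUnsignedGT). The helper name is illustrative.

// Stub layout for the conditional case (offsets from the first emitted byte):
//   +0   ldr x9, #16   ; x9 = taddr, loaded from the literal at +16
//   +4   b.COND #8     ; taken -> the br x9 at +12
//   +8   b #16         ; not taken -> +24, past the literal ("cont")
//   +12  br x9
//   +16  .quad taddr   ; 8-byte literal
//   +24  cont
// For kAlways the same span holds ldr / br / nop / nop / literal instead.
#include <cstdint>

// B.cond: 0x54000000 | (imm19 << 5) | cond, with LO (unsigned lower) = 0x3
// and HI (unsigned higher) = 0x8.
constexpr std::uint32_t b_cond(std::uint32_t cond, std::int32_t byte_offset)
{
	return 0x54000000u | ((static_cast<std::uint32_t>(byte_offset / 4) & 0x7FFFF) << 5) | cond;
}

static_assert(b_cond(0x3, 8) == 0x54000043, "kUnsignedLT -> 43 00 00 54");
static_assert(b_cond(0x8, 8) == 0x54000048, "kUnsignedGT -> 48 00 00 54");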
@@ -1203,17 +1233,7 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 	// Rewrite jump address
 	{
 		u64 raw64 = reinterpret_cast<u64>(raw);
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 48)), Imm(48));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 32)), Imm(32));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 16)), Imm(16));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64)), Imm(0));
-		});
-
-		memcpy(w.rel32 - (4 * 7), reinterpret_cast<u8*>(temp), 4 * 4);
+		memcpy(w.rel32 - 8, &raw64, 8);
 	}
#else
#error "Unimplemented"
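Note (not part of the commit): with the branch target stored as an 8-byte literal ending at w.rel32, retargeting the trampoline is now a single data write instead of re-encoding a movk sequence. A minimal sketch, with a made-up function name:

#include <cstdint>
#include <cstring>

// Illustrative only: overwrite the 8-byte branch-target literal that sits
// immediately before the recorded patch position.
static void retarget(std::uint8_t* patch_pos, const void* new_target)
{
	const std::uint64_t raw64 = reinterpret_cast<std::uint64_t>(new_target);
	std::memcpy(patch_pos - 8, &raw64, 8);
}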
@@ -1302,19 +1322,27 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 		raw += 4;
 	}
#elif defined(ARCH_ARM64)
-	{
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movz(a64::w9, Imm(static_cast<u16>(cmp_lsa >> 16)), Imm(16));
-			c.movk(a64::w9, Imm(static_cast<u16>(cmp_lsa)), Imm(0));
-			c.ldr(a64::w1, arm::Mem(a64::x7, a64::x9));
-		});
-
-		memcpy(raw, reinterpret_cast<u8*>(temp), 3 * 4);
-		raw += 3 * 4;
-	}
+	// ldr w9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x18;
+
+	// b #8
+	*raw++ = 0x02;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x14;
+
+	// cmp_lsa
+	std::memcpy(raw, &cmp_lsa, 4);
+	raw += 4;
+
+	// ldr w1, [x7, x9]
+	*raw++ = 0xE1;
+	*raw++ = 0x68;
+	*raw++ = 0x69;
+	*raw++ = 0xB8;
#else
#error "Unimplemented"
#endif
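Note (not part of the commit): the same load-over-literal idiom is reused here for a 32-bit constant: ldr w9, #8 reads the word placed right after the b #8, and the b #8 skips that word so it is never executed. A sketch of the two encodings with illustrative helper names; the register-offset load ldr w1, [x7, x9] encodes to 0xB86968E1, i.e. the bytes E1 68 69 B8 above.

#include <cstdint>

// LDR (literal), 32-bit variant: 0x18000000 | (imm19 << 5) | Rt
constexpr std::uint32_t ldr_w_literal(std::uint32_t rt, std::int32_t byte_offset)
{
	return 0x18000000u | ((static_cast<std::uint32_t>(byte_offset / 4) & 0x7FFFF) << 5) | rt;
}

// B (unconditional): 0x14000000 | imm26, where imm26 is the byte offset / 4
constexpr std::uint32_t b(std::int32_t byte_offset)
{
	return 0x14000000u | (static_cast<std::uint32_t>(byte_offset / 4) & 0x3FFFFFF);
}

static_assert(ldr_w_literal(9, 8) == 0x18000049, "ldr w9, #8 -> 49 00 00 18");
static_assert(b(8) == 0x14000002, "b #8 -> 02 00 00 14");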
@@ -1326,19 +1354,27 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 	std::memcpy(raw, &x, 4);
 	raw += 4;
#elif defined(ARCH_ARM64)
-	{
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movz(a64::w9, Imm(static_cast<u16>(x >> 16)), Imm(16));
-			c.movk(a64::w9, Imm(static_cast<u16>(x)), Imm(0));
-			c.cmp(a64::w1, a64::w9);
-		});
-
-		memcpy(raw, reinterpret_cast<u8*>(temp), 3 * 4);
-		raw += 3 * 4;
-	}
+	// ldr w9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x18;
+
+	// b #8
+	*raw++ = 0x02;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x14;
+
+	// x
+	std::memcpy(raw, &x, 4);
+	raw += 4;
+
+	// cmp w1, w9
+	*raw++ = 0x3f;
+	*raw++ = 0x00;
+	*raw++ = 0x09;
+	*raw++ = 0x6B;
#else
#error "Unimplemented"
#endif
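Note (not part of the commit): cmp w1, w9 is the alias for subs wzr, w1, w9; a quick check of the bytes written above, with an illustrative helper name:

#include <cstdint>

// CMP Wn, Wm is SUBS WZR, Wn, Wm: 0x6B00001F | (Rm << 16) | (Rn << 5)
constexpr std::uint32_t cmp_w(std::uint32_t rn, std::uint32_t rm)
{
	return 0x6B00001Fu | (rm << 16) | (rn << 5);
}

static_assert(cmp_w(1, 9) == 0x6B09003F, "cmp w1, w9 -> 3F 00 09 6B");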
@@ -1573,34 +1609,56 @@ spu_function_t spu_runtime::make_branch_patchpoint(u16 data) const
 
 	return reinterpret_cast<spu_function_t>(raw);
#elif defined(ARCH_ARM64)
-	spu_function_t func = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		// Save the jmp addr to GHC CC 3rd arg -> REG_Hp
-		Label replace_addr = c.newLabel();
-
-		c.adr(a64::x21, replace_addr);
-		// 16 byte alignment for the jump replacement
-		c.nop();
-		c.nop();
-		c.nop();
-		Label branch_target = c.newLabel();
-
-		c.bind(replace_addr);
-		c.ldr(a64::x9, arm::Mem(branch_target));
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(tr_branch));
-
-		c.embedUInt8(data >> 8);
-		c.embedUInt8(data & 0xff);
-
-		c.embed("branch_patchpoint", 17);
-	});
-
-	return func;
+#if defined(__APPLE__)
+	pthread_jit_write_protect_np(false);
+#endif
+
+	u8* const patch_fn = ensure(jit_runtime::alloc(36, 16));
+	u8* raw = patch_fn;
+
+	// adr x21, #16
+	*raw++ = 0x95;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x10;
+
+	// nop x3
+	for (int i = 0; i < 3; i++)
+	{
+		*raw++ = 0x1F;
+		*raw++ = 0x20;
+		*raw++ = 0x03;
+		*raw++ = 0xD5;
+	}
+
+	// ldr x9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x58;
+
+	// br x9
+	*raw++ = 0x20;
+	*raw++ = 0x01;
+	*raw++ = 0x1F;
+	*raw++ = 0xD6;
+
+	u64 branch_target = reinterpret_cast<u64>(tr_branch);
+	std::memcpy(raw, &branch_target, 8);
+	raw += 8;
+
+	*raw++ = static_cast<u8>(data >> 8);
+	*raw++ = static_cast<u8>(data & 0xff);
+
+#if defined(__APPLE__)
+	pthread_jit_write_protect_np(true);
+#endif
+
+	// Flush all cache lines after potentially writing executable code
+	asm("ISB");
+	asm("DSB ISH");
+
+	return reinterpret_cast<spu_function_t>(patch_fn);
#else
#error "Unimplemented"
#endif
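Note (not part of the commit): the patchpoint allocated above lays out as sketched below (offsets from patch_fn), matching the removed comment about x21 (REG_Hp, the GHC calling convention's third argument) receiving the address of the replaceable ldr/br pair. The ADR helper is illustrative.

// Offsets from patch_fn:
//   +0   adr x21, #16    ; x21 = patch_fn + 16, the start of the ldr/br pair
//   +4   nop             ; padding so that pair starts on a 16-byte boundary
//   +8   nop
//   +12  nop
//   +16  ldr x9, #8      ; x9 = tr_branch, loaded from the literal at +24
//   +20  br x9
//   +24  .quad tr_branch ; 8-byte literal
//   +32  data            ; two bytes, written high byte first
#include <cstdint>

// ADR: 0x10000000 | (immlo << 29) | (immhi << 5) | Rd, offset = immhi:immlo
constexpr std::uint32_t adr(std::uint32_t rd, std::int32_t byte_offset)
{
	return 0x10000000u
		| ((static_cast<std::uint32_t>(byte_offset) & 3u) << 29)
		| (((static_cast<std::uint32_t>(byte_offset) >> 2) & 0x7FFFFu) << 5)
		| rd;
}

static_assert(adr(21, 16) == 0x10000095, "adr x21, #16 -> 95 00 00 10");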
@@ -1636,18 +1694,26 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 
 	atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
#elif defined(ARCH_ARM64)
-	auto jump_instrs = build_function_asm<spu_function_t>("", [](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		Label branch_target = c.newLabel();
-		c.ldr(a64::x9, arm::Mem(branch_target)); // PC rel load
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(spu_runtime::tr_all));
-	});
-	u128 result = *reinterpret_cast<u128*>(jump_instrs);
+	union
+	{
+		u8 bytes[16];
+		u128 result;
+	};
+
+	// ldr x9, #8
+	bytes[0] = 0x49;
+	bytes[1] = 0x00;
+	bytes[2] = 0x00;
+	bytes[3] = 0x58;
+
+	// br x9
+	bytes[4] = 0x20;
+	bytes[5] = 0x01;
+	bytes[6] = 0x1F;
+	bytes[7] = 0xD6;
+
+	const u64 target = reinterpret_cast<u64>(spu_runtime::tr_all);
+	std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
 	pthread_jit_write_protect_np(false);
#endif
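Note (not part of the commit): the 16-byte image built through the union above is a minimal self-contained indirect jump: a PC-relative load of the 8-byte target that follows it, then br. Keeping it in a u128 suggests it is meant to be written over the patch site in one piece; the code that performs the write is outside this hunk. A sketch with illustrative names:

#include <cstdint>
#include <cstring>

struct jump_image
{
	std::uint8_t bytes[16];
};

static jump_image make_jump_image(const void* target)
{
	// ldr x9, #8 ; br x9 ; .quad target
	jump_image img{{0x49, 0x00, 0x00, 0x58, 0x20, 0x01, 0x1F, 0xD6}};
	const std::uint64_t t = reinterpret_cast<std::uint64_t>(target);
	std::memcpy(img.bytes + 8, &t, 8);
	return img;
}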
@@ -1768,18 +1834,26 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
 
 	atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
#elif defined(ARCH_ARM64)
-	auto jmp_instrs = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		Label branch_target = c.newLabel();
-		c.ldr(a64::x9, arm::Mem(branch_target)); // PC rel load
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(func));
-	});
-	u128 result = *reinterpret_cast<u128*>(jmp_instrs);
+	union
+	{
+		u8 bytes[16];
+		u128 result;
+	};
+
+	// ldr x9, #8
+	bytes[0] = 0x49;
+	bytes[1] = 0x00;
+	bytes[2] = 0x00;
+	bytes[3] = 0x58;
+
+	// br x9
+	bytes[4] = 0x20;
+	bytes[5] = 0x01;
+	bytes[6] = 0x1F;
+	bytes[7] = 0xD6;
+
+	const u64 target = reinterpret_cast<u64>(func);
+	std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
 	pthread_jit_write_protect_np(false);
#endif
