From 56cc5d9355b955c55379b31290a2000bdb902d34 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 4 Aug 2024 05:09:06 +0300
Subject: [PATCH] Initial PPU LLVM implementation for aarch64

---
 rpcs3/Emu/CPU/CPUTranslator.h    |  36 +++++++++
 rpcs3/Emu/CPU/Hypervisor.h       |  40 ++++++++++
 rpcs3/Emu/Cell/PPUThread.cpp     | 130 +++++++++++++++++++------
 rpcs3/Emu/Cell/PPUThread.h       |   4 +
 rpcs3/Emu/Cell/PPUTranslator.cpp |  88 ++++++++++++++-------
 rpcs3/Emu/Cell/PPUTranslator.h   |   3 +
 6 files changed, 223 insertions(+), 78 deletions(-)
 create mode 100644 rpcs3/Emu/CPU/Hypervisor.h

diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index dcce0bf02e..2cefe69a32 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -26,6 +26,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/InlineAsm.h"
 
 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -3898,4 +3899,39 @@ struct fmt_unveil
 	}
 };
 
+// Inline assembly wrappers.
+// TODO: Move these to proper location and replace macros with templates
+static inline
+llvm::InlineAsm* compile_inline_asm(
+	llvm::Type* returnType,
+	llvm::ArrayRef<llvm::Type*> argTypes,
+	const std::string& code,
+	const std::string& constraints)
+{
+	const auto callSig = llvm::FunctionType::get(returnType, argTypes, false);
+	return llvm::InlineAsm::get(callSig, code, constraints, true, false);
+}
+
+// Helper for ASM generation with dynamic number of arguments
+#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
+	do {\
+		std::vector<llvm::Type*> _argTypes;\
+		_argTypes.reserve(args.size());\
+		for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
+		auto _returnType = llvm::Type::getVoidTy(ctx);\
+		llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints);\
+		auto _c = irb->CreateCall(_callee, args);\
+		_c->addFnAttr(llvm::Attribute::AlwaysInline);\
+	} while(0)
+
+// Helper for ASM generation with 0 args
+#define LLVM_ASM_0(asm_, irb, ctx)\
+	do {\
+		const auto _voidTy = llvm::Type::getVoidTy(ctx);\
+		auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, "");\
+		auto _c = irb->CreateCall(_callee);\
+		_c->setTailCall();\
+		_c->addFnAttr(llvm::Attribute::AlwaysInline);\
+	} while(0)
+
 #endif
diff --git a/rpcs3/Emu/CPU/Hypervisor.h b/rpcs3/Emu/CPU/Hypervisor.h
new file mode 100644
index 0000000000..0d07897816
--- /dev/null
+++ b/rpcs3/Emu/CPU/Hypervisor.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/types.hpp>
+
+namespace rpcs3
+{
+	union alignas(16) hypervisor_context_t
+	{
+		u64 regs[16];
+
+		struct
+		{
+			u64 pc;
+			u64 sp;
+
+			u64 x18;
+			u64 x19;
+			u64 x20;
+			u64 x21;
+			u64 x22;
+			u64 x23;
+			u64 x24;
+			u64 x25;
+			u64 x26;
+			u64 x27;
+			u64 x28;
+			u64 x29;
+			u64 x30;
+
+			// x0-x17 unused
+		} aarch64;
+
+		struct
+		{
+			u64 sp;
+
+			// Other regs unused
+		} x86;
+	};
+}
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 21701389e3..61471ad2d4 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -222,7 +222,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 #endif
 
 	// Save native stack pointer for longjmp emulation
-	c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp);
+	c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
 
 	// Initialize args
 	c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
@@ -291,37 +291,48 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 	// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
 	// for AArch64 calling convention
 
-	// Save sp for native longjmp emulation
-	Label native_sp_offset = c.newLabel();
-	c.ldr(a64::x10, arm::Mem(native_sp_offset));
-	// sp not allowed to be used in load/stores directly
-	c.mov(a64::x15, a64::sp);
-	c.str(a64::x15, arm::Mem(args[0], a64::x10));
-
-	// Push callee saved registers to the stack
+	// Push callee saved registers to the hv context
+	// Assume our LLVM compiled code is unsafe and can clobber our stack. GHC on aarch64 treats stack as scratch.
+	// We also want to store the register context at a fixed place so we can read the hypervisor state from any location.
 	// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
-	c.sub(a64::sp, a64::sp, Imm(112));
-	c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
-	c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-	c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-	c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-	c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-	c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-	c.str(a64::x30, arm::Mem(a64::sp, 96));
+
+	// Pre-context save
+	// Layout:
+	// pc, sp
+	// x18, x19...x30
+	// NOTE: Do not touch x19..x30 before saving the registers!
+	const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+	Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address
+
+	// Sanity
+	ensure(hv_register_array_offset < 4096); // Imm12
+
+	c.mov(a64::x15, args[0]);
+	c.add(a64::x14, a64::x15, Imm(hv_register_array_offset)); // Per-thread context save
+
+	c.adr(a64::x15, hv_ctx_pc); // x15 = pc
+	c.mov(a64::x13, a64::sp);   // x13 = sp
+
+	c.stp(a64::x15, a64::x13, arm::Mem(a64::x14));
+	c.stp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+	c.stp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+	c.stp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+	c.stp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+	c.stp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+	c.stp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+	c.str(a64::x30, arm::Mem(a64::x14, 112));
 
 	// Load REG_Base - use absolute jump target to bypass rel jmp range limits
-	Label exec_addr = c.newLabel();
-	c.ldr(a64::x19, arm::Mem(exec_addr));
+	c.mov(a64::x19, Imm(reinterpret_cast<u64>(&vm::g_exec_addr)));
 	c.ldr(a64::x19, arm::Mem(a64::x19));
 	// Load PPUThread struct base -> REG_Sp
 	const arm::GpX ppu_t_base = a64::x20;
 	c.mov(ppu_t_base, args[0]);
 	// Load PC
 	const arm::GpX pc = a64::x15;
-	Label cia_offset = c.newLabel();
 	const arm::GpX cia_addr_reg = a64::x11;
 	// Load offset value
-	c.ldr(cia_addr_reg, arm::Mem(cia_offset));
+	c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));
 	// Load cia
 	c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));
 	// Multiply by 2 to index into ptr table
@@ -343,44 +354,45 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 	c.lsr(call_target, call_target, Imm(16));
 
 	// Load registers
-	Label base_addr = c.newLabel();
-	c.ldr(a64::x22, arm::Mem(base_addr));
+	c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
 	c.ldr(a64::x22, arm::Mem(a64::x22));
 
-	Label gpr_addr_offset = c.newLabel();
 	const arm::GpX gpr_addr_reg = a64::x9;
-	c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
+	c.mov(gpr_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::gpr))));
 	c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
 	c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
 	c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
 	c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
 
+	// GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
+	// Injected stack frames also work, but are not free and are completely unnecessary.
+	c.sub(a64::sp, a64::sp, Imm(4096));
+
 	// Execute LLE call
 	c.blr(call_target);
 
-	// Restore registers from the stack
-	c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
-	c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-	c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-	c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-	c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-	c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-	c.ldr(a64::x30, arm::Mem(a64::sp, 96));
-	// Restore stack ptr
-	c.add(a64::sp, a64::sp, Imm(112));
-	// Return
-	c.ret(a64::x30);
+	// Return address after far jump. Reset sp and start unwinding...
+	c.bind(hv_ctx_pc);
 
-	c.bind(exec_addr);
-	c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
-	c.bind(base_addr);
-	c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
-	c.bind(cia_offset);
-	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
-	c.bind(gpr_addr_offset);
-	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
-	c.bind(native_sp_offset);
-	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
+	// Execution guard undo (unneeded since we're going to hard-reset the SP)
+	//c.add(a64::sp, a64::sp, Imm(4096));
+
+	// We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
+	// Either way, x20 contains our thread base and we forcefully reset the stack pointer
+	c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
+
+	c.ldr(a64::x15, arm::Mem(a64::x14, 8));
+	c.ldp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+	c.ldp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+	c.ldp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+	c.ldp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+	c.ldp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+	c.ldp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+	c.ldr(a64::x30, arm::Mem(a64::x14, 112));
+
+	// Return
+	c.mov(a64::sp, a64::x15);
+	c.ret(a64::x30);
 
 #endif
 });
@@ -390,11 +402,20 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_esc
 
 #if defined(ARCH_X64)
 	// Restore native stack pointer (longjmp emulation)
-	c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
+	c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)));
 
 	// Return to the return location
 	c.sub(x86::rsp, 8);
 	c.ret();
+#else
+	// We really shouldn't be using this, but an implementation shouldn't hurt
+	// Far jump return. Only clobbers x30.
+	const arm::GpX ppu_t_base = a64::x20;
+	const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+	c.mov(ppu_t_base, args[0]);
+	c.mov(a64::x30, Imm(hv_register_array_offset));
+	c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30));
+	c.ret(a64::x30);
 #endif
 });
 
@@ -2265,6 +2286,9 @@ void ppu_thread::exec_task()
 {
 	if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
 	{
+		// HVContext push to allow recursion. This happens with guest callback invocations.
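+		// ppu_gateway stores pc/sp into hv_ctx at a fixed per-thread offset, so a nested
+		// gateway entry from a guest callback would clobber the outer context without this copy.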
+		const auto old_hv_ctx = hv_ctx;
+
 		while (true)
 		{
 			if (state) [[unlikely]]
@@ -2276,6 +2300,8 @@ void ppu_thread::exec_task()
 			ppu_gateway(this);
 		}
 
+		// HVContext pop
+		hv_ctx = old_hv_ctx;
 		return;
 	}
 
@@ -2314,6 +2340,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
 {
 	prio.raw().prio = _prio;
 
+	memset(&hv_ctx, 0, sizeof(hv_ctx));
+
 	gpr[1] = stack_addr + stack_size - ppu_stack_start_offset;
 	gpr[13] = param.tls_addr;
 
@@ -5277,12 +5305,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 		// Translate
 		if (const auto func = translator.Translate(module_part.funcs[fi]))
 		{
+#ifdef ARCH_X64 // TODO
 			// Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
 			pm.run(*func);
 #else
 			fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
 		}
 		else
 		{
@@ -5297,12 +5327,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 	{
 		if (const auto func = translator.GetSymbolResolver(whole_module))
 		{
+#ifdef ARCH_X64 // TODO
 			// Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
 			pm.run(*func);
 #else
 			fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
 		}
 		else
 		{
diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h
index 903262652d..86b6ebfa47 100644
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "../CPU/CPUThread.h"
+#include "../CPU/Hypervisor.h"
 #include "../Memory/vm_ptr.h"
 #include "Utilities/lockless.h"
 #include "Utilities/BitField.h"
@@ -163,6 +164,9 @@ public:
 
 	using cpu_thread::operator=;
 
+	// Hypervisor context data
+	alignas(16) rpcs3::hypervisor_context_t hv_ctx; // HV context for gate enter/exit. Keep at a low struct offset.
+
 	u64 gpr[32] = {}; // General-Purpose Registers
 	f64 fpr[32] = {}; // Floating Point Registers
 	v128 vr[32] = {}; // Vector Registers
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index b16772df40..521077ac1c 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -208,8 +208,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
 		m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, ptr, m_ir->getInt32((+cpu_flag::wait).operator u32()), llvm::MaybeAlign{4}, llvm::AtomicOrdering::AcquireRelease);
 
 		// Create tail call to the check function
-		Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-		m_ir->CreateRetVoid();
+		VMEscape(Call(GetType<void>(), "__check", m_thread, GetAddr()));
 	}
 	else
 	{
@@ -321,7 +320,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
 	if (vec_addrs.empty())
 	{
 		// Possible special case for no functions (allowing the do-while optimization)
-		m_ir->CreateRetVoid();
+		m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
 		replace_intrinsics(*m_function);
 		return m_function;
 	}
@@ -378,7 +377,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
 
 	// Set insertion point to afterloop_block
 	m_ir->SetInsertPoint(after_loop);
-	m_ir->CreateRetVoid();
+	m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here
 
 	replace_intrinsics(*m_function);
 	return m_function;
@@ -482,8 +481,8 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
 
 		if (_target >= u32{umax})
 		{
-			Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
-			m_ir->CreateRetVoid();
+			auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
+			VMEscape(c);
 			return;
 		}
 		else if (_target >= caddr && _target <= cend)
@@ -565,7 +564,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
 	const auto c = m_ir->CreateCall(callee, {m_exec, m_thread, seg0, m_base, GetGpr(0), GetGpr(1), GetGpr(2)});
 	c->setTailCallKind(llvm::CallInst::TCK_Tail);
 	c->setCallingConv(CallingConv::GHC);
-	m_ir->CreateRetVoid();
+	VMEscape(c);
 }
 
 Value* PPUTranslator::RegInit(Value*& local)
@@ -779,8 +778,8 @@ void PPUTranslator::TestAborted()
 	m_ir->SetInsertPoint(vcheck);
 
 	// Create tail call to the check function
-	Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-	m_ir->CreateRetVoid();
+	auto c = Call(GetType<void>(), "__check", m_thread, GetAddr());
+	VMEscape(c);
 	m_ir->SetInsertPoint(body);
 }
 
@@ -2206,16 +2205,14 @@ void PPUTranslator::SC(ppu_opcode_t op)
 
 		if (index < 1024)
 		{
-			Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
-			//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-			m_ir->CreateRetVoid();
+			auto c = Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
+			VMEscape(c, true);
 			return;
 		}
 	}
 
-	Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
-	//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-	m_ir->CreateRetVoid();
+	auto c = Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
+	VMEscape(c, true);
 }
 
 void PPUTranslator::B(ppu_opcode_t op)
@@ -2776,9 +2773,9 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
 	{
 		RegStore(Trunc(GetAddr()), m_cia);
 		FlushRegisters();
-		Call(GetType<void>(), "__resinterp", m_thread);
-		//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-		m_ir->CreateRetVoid();
+
+		auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+		VMEscape(inst, true);
 		return;
 	}
 
@@ -2928,9 +2925,9 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
 	{
 		RegStore(Trunc(GetAddr()), m_cia);
 		FlushRegisters();
-		Call(GetType<void>(), "__resinterp", m_thread);
-		//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-		m_ir->CreateRetVoid();
+
+		auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+		VMEscape(inst, true);
 		return;
 	}
 
@@ -4998,9 +4995,8 @@ void PPUTranslator::FCFID(ppu_opcode_t op)
 void PPUTranslator::UNK(ppu_opcode_t op)
 {
 	FlushRegisters();
-	Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
-	//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-	m_ir->CreateRetVoid();
+	auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
+	VMEscape(c, true);
 }
 
 
@@ -5279,9 +5275,8 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
 
 void PPUTranslator::Trap()
 {
-	Call(GetType<void>(), "__trap", m_thread, GetAddr());
-	//Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-	m_ir->CreateRetVoid();
+	auto c = Call(GetType<void>(), "__trap", m_thread, GetAddr());
+	VMEscape(c);
 }
 
 Value* PPUTranslator::CheckBranchCondition(u32 bo, u32 bi)
@@ -5328,6 +5323,42 @@ MDNode* PPUTranslator::CheckBranchProbability(u32 bo)
 	return nullptr;
 }
 
+void PPUTranslator::VMEscape([[maybe_unused]] llvm::CallInst* tail_call, [[maybe_unused]] bool skip_flush)
+{
+	//if (!skip_flush)
+	{
+		// Flush
+		FlushRegisters();
+	}
+
+#ifdef ARCH_X64
+	// Optionally flag last call as a tail
+	if (tail_call)
+	{
+		tail_call->setTailCall();
+	}
+
+	// This is actually AMD64 specific but good enough for now
+	m_ir->CreateRetVoid();
+#else
+
+	// Validation. Make sure we're escaping from a correct context. Only guest JIT should ever go through the "escape" gate.
+	const auto bb = m_ir->GetInsertBlock();
+	const auto arg = llvm::dyn_cast<llvm::Argument>(m_thread);
+	ensure(bb->getParent()->getName().str() == arg->getParent()->getName().str());
+
+	const u32 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+	const std::string asm_ = fmt::format(
+		"ldr x20, $0;\n"
+		"ldr x30, [x20, #%u];\n",
+		hv_register_array_offset);
+
+	LLVM_ASM(asm_, std::array{ m_thread }, "m", m_ir, m_function->getContext());
+	m_ir->CreateRetVoid();
+
+#endif
+}
+
 void PPUTranslator::build_interpreter()
 {
 #define BUILD_VEC_INST(i) { \
@@ -5343,8 +5374,7 @@ void PPUTranslator::build_interpreter()
 	op.vb = 2; \
 	op.vc = 3; \
 	this->i(op); \
-	FlushRegisters(); \
-	m_ir->CreateRetVoid(); \
+	VMEscape(); \
 	replace_intrinsics(*m_function); \
 }
 
diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h
index a71e42a033..f854297b3d 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@@ -150,6 +150,9 @@ public:
 	// Emit function call
 	void CallFunction(u64 target, llvm::Value* indirect = nullptr);
 
+	// Emit escape sequence back to hypervisor
+	void VMEscape(llvm::CallInst* tail_call = nullptr, bool skip_flush = false);
+
 	// Emit state check mid-block
 	void TestAborted();