From cba658babaf6211635d594d22b4cfed30f0bde1d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 5 Aug 2024 06:17:33 +0300 Subject: [PATCH] Complete PPU support --- rpcs3/Emu/CPU/Backends/AArch64JIT.cpp | 139 ++++++++++++++++++-------- rpcs3/Emu/CPU/Backends/AArch64JIT.h | 8 +- rpcs3/Emu/CPU/CPUTranslator.h | 54 ++++++---- rpcs3/Emu/Cell/PPUThread.cpp | 21 +++- rpcs3/Emu/Cell/PPUTranslator.cpp | 24 +++-- 5 files changed, 174 insertions(+), 72 deletions(-) diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp index 47ee1a6232..6f486a578d 100644 --- a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp +++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp @@ -2,6 +2,21 @@ #include "AArch64JIT.h" #include "../Hypervisor.h" +LOG_CHANNEL(jit_log, "JIT"); + +#define STDOUT_DEBUG + +#ifndef STDOUT_DEBUG +#define DPRINT jit_log.trace +#else +#define DPRINT(...)\ + do {\ + printf(__VA_ARGS__);\ + printf("\n");\ + fflush(stdout);\ + } while (0) +#endif + namespace aarch64 { // FIXME: This really should be part of fmt @@ -23,11 +38,11 @@ namespace aarch64 using function_info_t = GHC_frame_preservation_pass::function_info_t; GHC_frame_preservation_pass::GHC_frame_preservation_pass( - gpr base_reg, u32 hv_ctx_offset, + const std::vector>& base_register_lookup, std::function exclusion_callback) { - execution_context.base_register = base_reg; + execution_context.base_register_lookup = base_register_lookup; execution_context.hypervisor_context_offset = hv_ctx_offset; this->exclusion_callback = exclusion_callback; } @@ -118,6 +133,13 @@ namespace aarch64 instruction_info_t result{}; if (auto ci = llvm::dyn_cast(i)) { + // Watch out for injected ASM blocks... + if (llvm::isa(ci->getCalledOperand())) + { + // Not a real call. This is just an insert of inline asm + return result; + } + result.is_call_inst = true; result.is_returning = true; result.preserve_stack = !ci->isTailCall(); @@ -126,12 +148,15 @@ namespace aarch64 if (!result.callee) { - // TODO: What are these?????? 
Patchpoints maybe? Need to check again - result.is_call_inst = f.getName() == "__spu-null"; + // Indirect call (call from raw value). + result.is_indirect = true; + result.callee_is_GHC = ci->getCallingConv() == llvm::CallingConv::GHC; + result.callee_name = "__indirect_call"; } else { result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC; + result.callee_name = result.callee->getName().str(); } return result; } @@ -145,7 +170,8 @@ namespace aarch64 auto targetbb = bi->getSuccessor(0); result.callee = targetbb->getParent(); - result.is_call_inst = result.callee->getName() != f.getName(); + result.callee_name = result.callee->getName().str(); + result.is_call_inst = result.callee_name != f.getName(); } return result; @@ -155,10 +181,11 @@ namespace aarch64 { // Very unlikely to be the same function. Can be considered a function exit. ensure(bi->getNumDestinations() == 1); - auto targetbb = bi->getSuccessor(0); + auto targetbb = ensure(bi->getSuccessor(0)); // This is guaranteed to fail but I've yet to encounter this result.callee = targetbb->getParent(); - result.is_call_inst = result.callee->getName() != f.getName(); + result.callee_name = result.callee->getName().str(); + result.is_call_inst = result.callee_name != f.getName(); return result; } @@ -168,7 +195,8 @@ namespace aarch64 auto targetbb = bi->getSuccessor(0); result.callee = targetbb->getParent(); - result.is_call_inst = result.callee->getName() != f.getName(); + result.callee_name = result.callee->getName().str(); + result.is_call_inst = result.callee_name != f.getName(); return result; } @@ -178,13 +206,29 @@ namespace aarch64 auto targetbb = bi->getSuccessor(0); result.callee = targetbb->getParent(); - result.is_call_inst = result.callee->getName() != f.getName(); + result.callee_name = result.callee->getName().str(); + result.is_call_inst = result.callee_name != f.getName(); return result; } return result; } + gpr GHC_frame_preservation_pass::get_base_register_for_call(const 
std::string& callee_name) + { + // We go over the base_register_lookup table and find the first matching pattern + for (const auto& pattern : execution_context.base_register_lookup) + { + if (callee_name.starts_with(pattern.first)) + { + return pattern.second; + } + } + + // Default is x19 + return aarch64::x19; + } + void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f) { if (f.getCallingConv() != llvm::CallingConv::GHC) @@ -200,6 +244,14 @@ namespace aarch64 } const auto this_name = f.getName().str(); + if (visited_functions.find(this_name) != visited_functions.end()) + { + // Already processed. Only useful when recursing which is currently not used. + DPRINT("Function %s was already processed. Skipping.\n", this_name.c_str()); + return; + } + visited_functions.insert(this_name); + if (exclusion_callback && exclusion_callback(this_name)) { // Function is explicitly excluded @@ -220,14 +272,6 @@ namespace aarch64 // Asm snippets for patching stack frame std::string frame_prologue, frame_epilogue; - // Return address reload on exit. This is safer than trying to stuff things into the stack frame since the size is largely just guesswork at this time. - std::string x30_tail_restore = fmt::format( - "mov x30, #%u;\n" // Load offset to last gateway exit - "add x30, x%u, x30;\n" // Add to base register - "ldr x30, [x30];\n", // Load x30 - execution_context.hypervisor_context_offset, - static_cast(execution_context.base_register)); - if (function_info.stack_frame_size > 0) { // NOTE: The stack frame here is purely optional, we can pre-allocate scratch on the gateway. @@ -235,8 +279,12 @@ namespace aarch64 frame_prologue = fmt::format("sub sp, sp, #%u;", function_info.stack_frame_size); frame_epilogue = fmt::format("add sp, sp, #%u;", function_info.stack_frame_size); - // Emit the frame prologue - LLVM_ASM_0(frame_prologue, irb, f.getContext()); + // Emit the frame prologue. 
We use a BB here for extra safety as it solves the problem of backwards jumps re-executing the prologue. + auto functionStart = &f.front(); + auto prologueBB = llvm::BasicBlock::Create(f.getContext(), "", &f, functionStart); + irb->SetInsertPoint(prologueBB, prologueBB->begin()); + LLVM_ASM_VOID(frame_prologue, irb, f.getContext()); + irb->CreateBr(functionStart); } // Now we start processing @@ -259,7 +307,6 @@ namespace aarch64 if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm.")) { // Always inlined call. Likely inline Asm. Skip - // log("Function %s will ignore call to intrinsic function %s\n", this_name.c_str(), callee_name.c_str()); ++bit; continue; } @@ -278,48 +325,62 @@ namespace aarch64 if (function_info.stack_frame_size > 0) { - // 1. Nuke all scratch - LLVM_ASM_0(frame_epilogue, irb, f.getContext()); + // 1. Nuke the local stack frame if any + LLVM_ASM_VOID(frame_epilogue, irb, f.getContext()); } - if (function_info.clobbers_x30) - { - // 2. Restore the gateway as the current return address - LLVM_ASM_0(x30_tail_restore, irb, f.getContext()); - } - - // 3. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only. + // 2. We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only. // Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting. // This means we burn some cycles on every exit, but in return we do not require one instruction on the prologue + the ret chain is eliminated. // No ret-chain also means two BBs can call each other indefinitely without running out of stack without relying on llvm to optimize that away. 
std::string exit_fn; auto ci = ensure(llvm::dyn_cast(original_inst)); - auto operand_count = ci->getNumOperands(); + auto operand_count = ci->getNumOperands() - 1; // The last operand is the callee, not a real operand std::vector constraints; std::vector args; // We now load the callee args. // FIXME: This is often times redundant and wastes cycles, we'll clean this up in a MachineFunction pass later. - int base_reg = execution_context.base_register; + int args_base_reg = instruction_info.callee_is_GHC ? aarch64::x19 : aarch64::x0; // GHC args are always x19..x25 for (unsigned i = 0; i < operand_count; ++i) { args.push_back(ci->getOperand(i)); - exit_fn += fmt::format("mov x%d, $%u;\n", base_reg++, i); + exit_fn += fmt::format("mov x%d, $%u;\n", args_base_reg++, i); constraints.push_back("r"); } - std::copy(ci->operands().begin(), ci->operands().end(), args.begin()); + auto context_base_reg = get_base_register_for_call(instruction_info.callee_name); + if (!instruction_info.callee_is_GHC) + { + // For non-GHC calls, we have to remap the arguments to x0... + context_base_reg = static_cast(context_base_reg - 19); + } + + if (function_info.clobbers_x30) + { + // 3. Restore the exit gate as the current return address + // We want to do this after loading the arguments in case there was any spilling involved. 
+ DPRINT("Patching call from %s to %s on register %d...", + this_name.c_str(), + instruction_info.callee_name.c_str(), + static_cast(context_base_reg)); + + const auto x30_tail_restore = fmt::format( + "ldr x30, [x%u, #%u];\n", // Load x30 from thread context + static_cast(context_base_reg), + execution_context.hypervisor_context_offset); + + exit_fn += x30_tail_restore; + } + auto target = ensure(ci->getCalledOperand()); args.push_back(target); - if (ci->isIndirectCall()) + if (instruction_info.is_indirect) { constraints.push_back("r"); - exit_fn += fmt::format( - "mov x15, $%u;\n" - "br x15", - operand_count); + exit_fn += fmt::format("br $%u;\n", operand_count); } else { @@ -328,7 +389,7 @@ namespace aarch64 } // Emit the branch - LLVM_ASM(exit_fn, args, join_strings(constraints, ","), irb, f.getContext()); + llvm_asm(irb, exit_fn, args, join_strings(constraints, ","), f.getContext()); // Delete original call instruction bit = ci->eraseFromParent(); diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64JIT.h index 5cfe5eafd8..e5a8958d63 100644 --- a/rpcs3/Emu/CPU/Backends/AArch64JIT.h +++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.h @@ -39,14 +39,16 @@ namespace aarch64 bool is_returning; // This instruction "returns" to the next instruction (typically just llvm::CallInst*) bool callee_is_GHC; // The other function is GHC bool is_tail_call; // Tail call. Assume it is an exit/terminator. + bool is_indirect; // Indirect call. Target is the first operand. llvm::Function* callee; // Callee if any + std::string callee_name; // Name of the callee. 
}; protected: std::unordered_set visited_functions; struct { - gpr base_register; + std::vector> base_register_lookup; u32 hypervisor_context_offset; } execution_context; @@ -57,11 +59,13 @@ namespace aarch64 function_info_t preprocess_function(llvm::Function& f); instruction_info_t decode_instruction(llvm::Function& f, llvm::Instruction* i); + + gpr get_base_register_for_call(const std::string& callee_name); public: GHC_frame_preservation_pass( - gpr base_reg, u32 hv_ctx_offset, + const std::vector>& base_register_lookup = {}, std::function exclusion_callback = {}); ~GHC_frame_preservation_pass() = default; diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index a2aa9f7213..91d14033ac 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3089,6 +3089,9 @@ protected: void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine); + // Run intrinsics replacement pass + void replace_intrinsics(llvm::Function&); + public: // Register a transformation pass to be run before final compilation by llvm void register_transform_pass(std::unique_ptr& pass) @@ -3797,9 +3800,6 @@ public: } } - // Run intrinsics replacement pass - void replace_intrinsics(llvm::Function&); - // Finalize processing void run_transforms(llvm::Function&); @@ -3935,25 +3935,39 @@ llvm::InlineAsm* compile_inline_asm( } // Helper for ASM generation with dynamic number of arguments +static inline +llvm::CallInst* llvm_asm( + llvm::IRBuilder<>* irb, + std::string& asm_, + llvm::ArrayRef args, + const std::string& constraints, + llvm::LLVMContext& context) +{ + llvm::ArrayRef types_ref = std::nullopt; + std::vector types; + types.reserve(args.size()); + + if (!args.empty()) + { + for (const auto& arg : args) + { + types.push_back(arg->getType()); + } + types_ref = types; + } + + auto return_type = llvm::Type::getVoidTy(context); + auto callee = compile_inline_asm(return_type, types_ref, asm_, constraints); + auto c = irb->CreateCall(callee, 
args); + c->addFnAttr(llvm::Attribute::AlwaysInline); + return c; +} + #define LLVM_ASM(asm_, args, constraints, irb, ctx)\ - do {\ - std::vector _argTypes;\ - _argTypes.reserve(args.size());\ - for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\ - auto _returnType = llvm::Type::getVoidTy(ctx); \ - llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \ - auto _c = irb->CreateCall(_callee, args); \ - _c->addFnAttr(llvm::Attribute::AlwaysInline); \ - } while(0) + llvm_asm(irb, asm_, args, constraints, ctx) // Helper for ASM generation with 0 args -#define LLVM_ASM_0(asm_, irb, ctx)\ - do {\ - const auto _voidTy = llvm::Type::getVoidTy(ctx); \ - auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \ - auto _c = irb->CreateCall(_callee); \ - _c->setTailCall(); \ - _c->addFnAttr(llvm::Attribute::AlwaysInline); \ - } while(0) +#define LLVM_ASM_VOID(asm_, irb, ctx)\ + llvm_asm(irb, asm_, {}, "", ctx) #endif diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 595103173d..fff0aae0f9 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -371,8 +371,16 @@ const auto ppu_gateway = build_function_asm("ppu_gateway", c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8)); c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16)); - // GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code. - // Injected stack frames also work, but are not free and are completely unnecessary. + // Thread context save. This is needed for PPU because different functions can switch between x19 and x20 for the base register. + // We need a different solution to ensure that no matter which version, we get the right value on far return. + c.mov(a64::x26, ppu_t_base); + + // Save thread pointer to stack. SP is the only register preserved across GHC calls. 
+ c.sub(a64::sp, a64::sp, Imm(16)); + c.str(a64::x20, arm::Mem(a64::sp)); + + // GHC scratchpad mem. If managed correctly (i.e no returns ever), GHC functions should never require a stack frame. + // We allocate a slab to use for all functions as they tail-call into each other. c.sub(a64::sp, a64::sp, Imm(4096)); // Execute LLE call @@ -381,11 +389,14 @@ const auto ppu_gateway = build_function_asm("ppu_gateway", // Return address after far jump. Reset sp and start unwinding... c.bind(hv_ctx_pc); - // Execution guard undo (unneded since we're going to hard-reset the SP) - //c.add(a64::sp, a64::sp, Imm(4096)); + // Clear scratchpad allocation + c.add(a64::sp, a64::sp, Imm(4096)); + + c.ldr(a64::x20, arm::Mem(a64::sp)); + c.add(a64::sp, a64::sp, Imm(16)); // We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg - // Either way, x20 contains our thread base and we forcefully reset the stack pointer + // Either way, x26 contains our thread base and we forcefully reset the stack pointer c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save c.ldr(a64::x15, arm::Mem(a64::x14, 8)); diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 1ac44770d1..c9040b1b4d 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -36,9 +36,21 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo // Initialize transform passes #ifdef ARCH_ARM64 - std::unique_ptr ghc_fixup_pass = std::make_unique( - aarch64::x20, ::offset32(&ppu_thread::hv_ctx)); + // Base reg table definition + // Assume all functions named __0x... 
are PPU functions and take the m_exec as the first arg + std::vector> base_reg_lookup = { + { "__0x", aarch64::x20 }, // PPU blocks + { "__indirect", aarch64::x20 }, // Indirect jumps + { "ppu_", aarch64::x19 }, // Fixed JIT helpers (e.g ppu_gateway) + { "__", aarch64::x19 } // Probably link table entries + }; + // Create transform pass + std::unique_ptr ghc_fixup_pass = std::make_unique( + ::offset32(&ppu_thread::hv_ctx), + base_reg_lookup); + + // Register it register_transform_pass(ghc_fixup_pass); #endif @@ -282,7 +294,7 @@ Function* PPUTranslator::Translate(const ppu_function& info) } } - replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -334,7 +346,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) { // Possible special case for no functions (allowing the do-while optimization) m_ir->CreateRetVoid(); - replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -392,7 +404,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) m_ir->CreateRetVoid(); - replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -5357,7 +5369,7 @@ void PPUTranslator::build_interpreter() this->i(op); \ FlushRegisters(); \ m_ir->CreateRetVoid(); \ - replace_intrinsics(*m_function); \ + run_transforms(*m_function); \ } BUILD_VEC_INST(VADDCUW);