From a976ac33538792718b0d9f6a0b5f40bdb99682f5 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 4 Aug 2024 18:15:23 +0300
Subject: [PATCH] jit: Add aarch64 JIT backend for pre-codegen transforms

---
 rpcs3/Emu/CMakeLists.txt              |   1 +
 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp | 347 ++++++++++++++++++++++++++
 rpcs3/Emu/CPU/Backends/AArch64JIT.h   |  71 ++++++
 rpcs3/Emu/CPU/CPUTranslator.cpp       |  11 +
 rpcs3/Emu/CPU/CPUTranslator.h         |  24 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp      |   8 +-
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp  |   4 +-
 7 files changed, 459 insertions(+), 7 deletions(-)
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64JIT.h

diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt
index 3fafe5c247..ecb9bdf7ef 100644
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@@ -387,6 +387,7 @@ target_link_libraries(rpcs3_emu
 target_sources(rpcs3_emu PRIVATE
     CPU/CPUThread.cpp
     CPU/CPUTranslator.cpp
+    CPU/Backends/AArch64JIT.cpp
 )
 
 target_link_libraries(rpcs3_emu
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
new file mode 100644
index 0000000000..e2a2e0ef93
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
@@ -0,0 +1,347 @@
+#include "stdafx.h"
+#include "AArch64JIT.h"
+#include "../Hypervisor.h"
+
+namespace aarch64
+{
+    // FIXME: This really should be part of fmt
+    static std::string join_strings(const std::vector<std::string>& v, const char* delim)
+    {
+        std::string result;
+        for (const auto& s : v)
+        {
+            if (!result.empty())
+            {
+                result += delim;
+            }
+            result += s;
+        }
+        return result;
+    }
+
+    using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
+    using function_info_t = GHC_frame_preservation_pass::function_info_t;
+
+    GHC_frame_preservation_pass::GHC_frame_preservation_pass(
+        gprs base_reg,
+        u32 hv_ctx_offset,
+        std::function<bool(const std::string&)> exclusion_callback)
+    {
+        execution_context.base_register = base_reg;
+        execution_context.hypervisor_context_offset = hv_ctx_offset;
+        this->exclusion_callback = exclusion_callback;
+    }
+
+    void GHC_frame_preservation_pass::reset()
+    {
+        visited_functions.clear();
+    }
+
+    void GHC_frame_preservation_pass::force_tail_call_terminators(llvm::Function& f)
+    {
+        // GHC functions do not preserve the call stack and can therefore never return if they make any external calls at all.
+        // Replace every terminator clause with an explicit tail call. This is already required for X64 to work, but better safe than sorry.
+        for (auto& bb : f)
+        {
+            auto bit = bb.begin(), prev = bb.end();
+            for (; bit != bb.end(); prev = bit, ++bit)
+            {
+                if (prev == bb.end())
+                {
+                    continue;
+                }
+
+                if (auto ri = llvm::dyn_cast<llvm::ReturnInst>(&*bit))
+                {
+                    if (auto ci = llvm::dyn_cast<llvm::CallInst>(&*prev))
+                    {
+                        // This is a "ret" coming right after a "call" to another function.
+                        // Enforce that it must be a tail call.
+                        if (!ci->isTailCall())
+                        {
+                            ci->setTailCall();
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    function_info_t GHC_frame_preservation_pass::preprocess_function(llvm::Function& f)
+    {
+        function_info_t result{};
+        result.instruction_count = f.getInstructionCount();
+
+        // Blanket exclusions. Stubs or dispatchers that do not compute anything themselves.
+        if (f.getName() == "__spu-null")
+        {
+            // Don't waste the effort processing this stub. It has no points of concern.
+            return result;
+        }
+
+        // Stack frame estimation. SPU code can be very long and consume several KB of stack.
+        u32 stack_frame_size = 128u;
+        // Actual ratio is usually around 1:4
+        const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+        // Because GHC doesn't preserve the stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+        // We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+        // FIXME: Aggressive spilling is only really a thing with vector operations. We can detect those instead.
+        // A proper fix is to port this to a MachineFunction (MF) pass, but I have PTSD from working at MF level.
+        const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+        stack_frame_size *= std::min(spill_pages, 32u); // 128 bytes to 4k, dynamic. It is unlikely that any frame consumes more than 4096 bytes.
+
+        result.stack_frame_size = stack_frame_size;
+        result.instruction_count = f.getInstructionCount();
+        result.num_external_calls = 0;
+
+        // The LR is not spared by LLVM in cases where there is a lot of spilling.
+        // This is another thing to be moved to a MachineFunction pass.
+        result.clobbers_x30 = result.instruction_count > 32;
+
+        for (auto& bb : f)
+        {
+            for (auto& inst : bb)
+            {
+                if (auto ci = llvm::dyn_cast<llvm::CallInst>(&inst))
+                {
+                    result.num_external_calls++;
+                    result.clobbers_x30 |= (!ci->isTailCall());
+                }
+            }
+        }
+
+        return result;
+    }
+
+    instruction_info_t GHC_frame_preservation_pass::decode_instruction(llvm::Function& f, llvm::Instruction* i)
+    {
+        instruction_info_t result{};
+        if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+        {
+            result.is_call_inst = true;
+            result.is_returning = true;
+            result.preserve_stack = !ci->isTailCall();
+            result.callee = ci->getCalledFunction();
+            result.is_tail_call = ci->isTailCall();
+
+            if (!result.callee)
+            {
+                // TODO: What are these? Patchpoints, maybe? Need to check again.
+                result.is_call_inst = f.getName() == "__spu-null";
+            }
+            else
+            {
+                result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
+            }
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::BranchInst>(i))
+        {
+            // More likely to jump out via an unconditional...
+            if (!bi->isConditional())
+            {
+                ensure(bi->getNumSuccessors() == 1);
+                auto targetbb = bi->getSuccessor(0);
+
+                result.callee = targetbb->getParent();
+                result.is_call_inst = result.callee->getName() != f.getName();
+            }
+
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::IndirectBrInst>(i))
+        {
+            // Very unlikely to be the same function. Can be considered a function exit.
+            ensure(bi->getNumDestinations() == 1);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::CallBrInst>(i))
+        {
+            ensure(bi->getNumSuccessors() == 1);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::InvokeInst>(i))
+        {
+            ensure(bi->getNumSuccessors() == 2);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        return result;
+    }
+
+    void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
+    {
+        if (f.getCallingConv() != llvm::CallingConv::GHC)
+        {
+            // If we're not using GHC, the calling convention handles stack fixup on its own via the prologue/epilogue.
+            return;
+        }
+
+        if (f.getInstructionCount() == 0)
+        {
+            // Nothing to do. Happens with placeholder functions such as branch patchpoints.
+            return;
+        }
+
+        const auto this_name = f.getName().str();
+        if (exclusion_callback && exclusion_callback(this_name))
+        {
+            // Function is explicitly excluded
+            return;
+        }
+
+        // Preprocessing.
+        auto function_info = preprocess_function(f);
+        if (function_info.num_external_calls == 0 && function_info.stack_frame_size == 0)
+        {
+            // No stack frame injection and no external calls to patch up. This is a leaf function, nothing to do.
+            return;
+        }
+
+        // Force tail calls on all terminators
+        force_tail_call_terminators(f);
+
+        // Asm snippets for patching the stack frame
+        std::string frame_prologue, frame_epilogue;
+
+        // Return address reload on exit. This is safer than trying to stuff things into the stack frame, since the frame size is largely guesswork at this time.
+        std::string x30_tail_restore = fmt::format(
+            "mov x30, #%u;\n"      // Load offset to last gateway exit
+            "add x30, x%u, x30;\n" // Add to base register
+            "ldr x30, [x30];\n",   // Load x30
+            execution_context.hypervisor_context_offset,
+            execution_context.base_register);
+
+        if (function_info.stack_frame_size > 0)
+        {
+            // NOTE: The stack frame here is purely optional; we could pre-allocate scratch on the gateway.
+            // However, that is an optimization for another time; this helps make debugging easier.
+            frame_prologue = fmt::format("sub sp, sp, #%u;", function_info.stack_frame_size);
+            frame_epilogue = fmt::format("add sp, sp, #%u;", function_info.stack_frame_size);
+
+            // Emit the frame prologue
+            LLVM_ASM_0(frame_prologue, irb, f.getContext());
+        }
+
+        // Now we start processing
+        bool terminator_found = false;
+        for (auto& bb : f)
+        {
+            for (auto bit = bb.begin(); bit != bb.end();)
+            {
+                const auto instruction_info = decode_instruction(f, &(*bit));
+                if (!instruction_info.is_call_inst)
+                {
+                    ++bit;
+                    continue;
+                }
+
+                std::string callee_name = "__unknown";
+                if (const auto cf = instruction_info.callee)
+                {
+                    callee_name = cf->getName().str();
+                    if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
+                    {
+                        // Always-inlined call, likely inline asm. Skip.
+                        // log("Function %s will ignore call to intrinsic function %s\n", this_name.c_str(), callee_name.c_str());
+                        ++bit;
+                        continue;
+                    }
+
+                    // Technically we should also ignore any host functions linked in, usually starting with a ppu_ or spu_ prefix.
+                    // However, there is little guarantee that those are safe, with only rare exceptions, and patching the frame around them doesn't hurt much anyway.
+                }
+
+                terminator_found |= instruction_info.is_tail_call;
+
+                if (!instruction_info.preserve_stack)
+                {
+                    // Now we patch the call if required. Normal calls that 'return' (i.e. calls to the C/C++ ABI) are not patched, as they manage the stack themselves (callee-managed).
+                    llvm::Instruction* original_inst = llvm::dyn_cast<llvm::Instruction>(bit);
+                    irb->SetInsertPoint(ensure(llvm::dyn_cast<llvm::Instruction>(bit)));
+
+                    if (function_info.stack_frame_size > 0)
+                    {
+                        // 1. Nuke all scratch
+                        LLVM_ASM_0(frame_epilogue, irb, f.getContext());
+                    }
+
+                    if (function_info.clobbers_x30)
+                    {
+                        // 2. Restore the gateway as the current return address
+                        LLVM_ASM_0(x30_tail_restore, irb, f.getContext());
+                    }
+
+                    // 3. We're about to make a tail call, which means that after this call we're supposed to return immediately. In that case, don't link; lower to a branch only.
+                    // Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
+                    // This means we burn some cycles on every exit, but in return we do not require an extra instruction in the prologue, and the ret chain is eliminated.
+                    // No ret-chain also means two BBs can call each other indefinitely without running out of stack and without relying on LLVM to optimize that away.
+
+                    std::string exit_fn;
+                    auto ci = ensure(llvm::dyn_cast<llvm::CallInst>(original_inst));
+                    auto operand_count = ci->getNumOperands();
+                    std::vector<std::string> constraints;
+                    std::vector<llvm::Value*> args;
+
+                    // We now load the callee args.
+                    // FIXME: This is often redundant and wastes cycles; we'll clean it up in a MachineFunction pass later.
+                    int base_reg = execution_context.base_register;
+                    for (unsigned i = 0; i < operand_count; ++i)
+                    {
+                        args.push_back(ci->getOperand(i));
+                        exit_fn += fmt::format("mov x%d, $%u;\n", base_reg++, i);
+                        constraints.push_back("r");
+                    }
+
+                    std::copy(ci->operands().begin(), ci->operands().end(), args.begin());
+                    auto target = ensure(ci->getCalledOperand());
+                    args.push_back(target);
+
+                    if (ci->isIndirectCall())
+                    {
+                        constraints.push_back("r");
+                        exit_fn += fmt::format(
+                            "mov x15, $%u;\n"
+                            "br x15",
+                            operand_count);
+                    }
+                    else
+                    {
+                        constraints.push_back("i");
+                        exit_fn += fmt::format("b $%u;\n", operand_count);
+                    }
+
+                    // Emit the branch
+                    LLVM_ASM(exit_fn, args, join_strings(constraints, ","), irb, f.getContext());
+
+                    // Delete the original call instruction
+                    bit = ci->eraseFromParent();
+                }
+
+                // Next
+                if (bit != bb.end())
+                {
+                    ++bit;
+                }
+            }
+        }
+
+        ensure(terminator_found, "Could not find terminator for function!");
+    }
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
new file mode 100644
index 0000000000..77ec184184
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#ifndef ARCH_ARM64
+#error "You have included an arm-only header"
+#endif
+
+#include <functional>
+#include "../CPUTranslator.h"
+
+#include <unordered_set>
+
+namespace aarch64
+{
+    enum gprs : s32
+    {
+        x0 = 0,
+        x1, x2, x3, x4, x5, x6, x7, x8, x9,
+        x10, x11, x12, x13, x14, x15, x16, x17, x18, x19,
+        x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30
+    };
+
+    // On non-x86 architectures GHC runs stackless. SP is treated as a pointer to scratchpad memory.
+    // This pass keeps that behavior intact while preserving the expectations of the host's C++ ABI.
+    class GHC_frame_preservation_pass : public translator_pass
+    {
+    public:
+        struct function_info_t
+        {
+            u32 instruction_count;
+            u32 num_external_calls;
+            u32 stack_frame_size; // Guessing this properly is critical for vector-heavy functions where spilling is a lot more common
+            bool clobbers_x30;
+        };
+
+        struct instruction_info_t
+        {
+            bool is_call_inst;   // Is a function call. This includes a branch to external code.
+            bool preserve_stack; // Preserve the stack around this call.
+            bool is_returning;   // This instruction "returns" to the next instruction (typically just llvm::CallInst*)
+            bool callee_is_GHC;  // The other function is GHC
+            bool is_tail_call;   // Tail call. Assume it is an exit/terminator.
+            llvm::Function* callee; // Callee, if any
+        };
+
+    protected:
+        std::unordered_set<std::string> visited_functions;
+
+        struct
+        {
+            gprs base_register;
+            u32 hypervisor_context_offset;
+        } execution_context;
+
+        std::function<bool(const std::string&)> exclusion_callback;
+
+        void force_tail_call_terminators(llvm::Function& f);
+
+        function_info_t preprocess_function(llvm::Function& f);
+
+        instruction_info_t decode_instruction(llvm::Function& f, llvm::Instruction* i);
+
+    public:
+        GHC_frame_preservation_pass(
+            gprs base_reg,
+            u32 hv_ctx_offset,
+            std::function<bool(const std::string&)> exclusion_callback = {});
+        ~GHC_frame_preservation_pass() = default;
+
+        void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
+        void reset() override;
+    };
+}
diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp
index 28bc0fc3e5..8d101fca11 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.cpp
+++ b/rpcs3/Emu/CPU/CPUTranslator.cpp
@@ -392,6 +392,17 @@ void cpu_translator::replace_intrinsics(llvm::Function& f)
     }
 }
 
+void cpu_translator::run_transforms(llvm::Function& f)
+{
+    // This pass must run first because the other passes may depend on resolved names.
+    replace_intrinsics(f);
+
+    for (auto& pass : m_transform_passes)
+    {
+        pass->run(m_ir, f);
+    }
+}
+
 void cpu_translator::erase_stores(llvm::ArrayRef<llvm::Value*> args)
 {
     for (auto v : args)
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 2cefe69a32..a2aa9f7213 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3033,6 +3033,16 @@ struct llvm_calli
     }
 };
 
+class translator_pass
+{
+public:
+    translator_pass() = default;
+    virtual ~translator_pass() {}
+
+    virtual void run(llvm::IRBuilder<>* irb, llvm::Function& func) = 0;
+    virtual void reset() = 0;
+};
+
 class cpu_translator
 {
 protected:
@@ -3074,9 +3084,18 @@ protected:
     // IR builder
     llvm::IRBuilder<>* m_ir = nullptr;
 
+    // Customized transformation passes. Technically the intrinsics replacement belongs here.
+    std::vector<std::unique_ptr<translator_pass>> m_transform_passes;
+
     void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
 
 public:
+    // Register a transformation pass to be run before final compilation by LLVM
+    void register_transform_pass(std::unique_ptr<translator_pass>& pass)
+    {
+        m_transform_passes.emplace_back(std::move(pass));
+    }
+
     // Convert a C++ type to an LLVM type (TODO: remove)
     template <typename T>
     llvm::Type* GetType()
@@ -3778,9 +3797,12 @@ public:
         }
     }
 
-    // Finalize processing custom intrinsics
+    // Run intrinsics replacement pass
     void replace_intrinsics(llvm::Function&);
 
+    // Finalize processing
+    void run_transforms(llvm::Function&);
+
     // Erase store instructions of provided
     void erase_stores(llvm::ArrayRef<llvm::Value*> args);
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 521077ac1c..165955fe88 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -269,7 +269,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
         }
     }
 
-    replace_intrinsics(*m_function);
+    run_transforms(*m_function);
 
     return m_function;
 }
@@ -321,7 +321,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     {
         // Possible special case for no functions (allowing the do-while optimization)
         m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
- replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -379,7 +379,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here - replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -5375,7 +5375,7 @@ void PPUTranslator::build_interpreter() op.vc = 3; \ this->i(op); \ VMEscape(); \ - replace_intrinsics(*m_function); \ + run_transforms(*m_function); \ } BUILD_VEC_INST(VADDCUW); diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 4c704d5e2b..574d4acb93 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -2605,7 +2605,7 @@ public: for (auto& f : *m_module) { - replace_intrinsics(f); + run_transforms(f); } for (const auto& func : m_functions) @@ -3089,7 +3089,7 @@ public: for (auto& f : *_module) { - replace_intrinsics(f); + run_transforms(f); } std::string log;
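
Note: this patch adds the pass and the register_transform_pass() hook, but does not itself wire the pass into a recompiler; that registration happens in the backends. A minimal sketch of what a call site could look like against the API above — the base register choice, context offset, include path, and exclusion rule here are invented placeholders for illustration, not values taken from this patch:

#include "Emu/CPU/Backends/AArch64JIT.h"

// Hypothetical call site. Assumes x19 holds the thread-context pointer and
// that the saved gateway x30 lives at offset 0 inside the hypervisor context;
// both are illustrative stand-ins, not values from this patch.
static void register_aarch64_passes(cpu_translator& translator)
{
    std::unique_ptr<translator_pass> pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
        aarch64::x19, // base register holding the execution context (assumption)
        0,            // hv_ctx_offset: offset of the saved x30 (assumption)
        [](const std::string& name)
        {
            // Skip stubs with nothing to patch, mirroring the pass's own "__spu-null" special case.
            return name == "__spu-null";
        });

    // Ownership moves into the translator; the pass then runs on every
    // function via run_transforms() before final LLVM compilation.
    translator.register_transform_pass(pass);
}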