From a976ac33538792718b0d9f6a0b5f40bdb99682f5 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 4 Aug 2024 18:15:23 +0300
Subject: [PATCH] jit: Add aarch64 JIT backend for pre-codegen transforms

---
 rpcs3/Emu/CMakeLists.txt              |   1 +
 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp | 347 ++++++++++++++++++++++++++
 rpcs3/Emu/CPU/Backends/AArch64JIT.h   |  71 ++++++
 rpcs3/Emu/CPU/CPUTranslator.cpp       |  11 +
 rpcs3/Emu/CPU/CPUTranslator.h         |  24 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp      |   8 +-
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp  |   4 +-
 7 files changed, 459 insertions(+), 7 deletions(-)
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64JIT.h

diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt
index 3fafe5c247..ecb9bdf7ef 100644
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@@ -387,6 +387,7 @@ target_link_libraries(rpcs3_emu
 target_sources(rpcs3_emu PRIVATE
     CPU/CPUThread.cpp
     CPU/CPUTranslator.cpp
+    CPU/Backends/AArch64JIT.cpp
 )
 
 target_link_libraries(rpcs3_emu
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
new file mode 100644
index 0000000000..e2a2e0ef93
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.cpp
@@ -0,0 +1,347 @@
+#include "stdafx.h"
+#include "AArch64JIT.h"
+#include "../Hypervisor.h"
+
+namespace aarch64
+{
+    // FIXME: This really should be part of fmt
+    static std::string join_strings(const std::vector<std::string>& v, const char* delim)
+    {
+        std::string result;
+        for (const auto& s : v)
+        {
+            if (!result.empty())
+            {
+                result += delim;
+            }
+            result += s;
+        }
+        return result;
+    }
+
+    using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
+    using function_info_t = GHC_frame_preservation_pass::function_info_t;
+
+    GHC_frame_preservation_pass::GHC_frame_preservation_pass(
+        gprs base_reg,
+        u32 hv_ctx_offset,
+        std::function<bool(const std::string&)> exclusion_callback)
+    {
+        execution_context.base_register = base_reg;
+        execution_context.hypervisor_context_offset = hv_ctx_offset;
+        this->exclusion_callback = exclusion_callback;
+    }
+
+    void GHC_frame_preservation_pass::reset()
+    {
+        visited_functions.clear();
+    }
+
+    void GHC_frame_preservation_pass::force_tail_call_terminators(llvm::Function& f)
+    {
+        // GHC functions do not preserve the call stack and can therefore never return if they make any external calls at all.
+        // Replace every terminator clause with an explicit tail call. This is already required for X64 to work, but better safe than sorry.
+        for (auto& bb : f)
+        {
+            auto bit = bb.begin(), prev = bb.end();
+            for (; bit != bb.end(); prev = bit, ++bit)
+            {
+                if (prev == bb.end())
+                {
+                    continue;
+                }
+
+                if (auto ri = llvm::dyn_cast<llvm::ReturnInst>(&*bit))
+                {
+                    if (auto ci = llvm::dyn_cast<llvm::CallInst>(&*prev))
+                    {
+                        // This is a "ret" coming right after a "call" to another function.
+                        // Enforce that it must be a tail call.
+                        if (!ci->isTailCall())
+                        {
+                            ci->setTailCall();
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    function_info_t GHC_frame_preservation_pass::preprocess_function(llvm::Function& f)
+    {
+        function_info_t result{};
+        result.instruction_count = f.getInstructionCount();
+
+        // Blanket exclusions. Stubs or dispatchers that do not compute anything themselves.
+        if (f.getName() == "__spu-null")
+        {
+            // Don't waste the effort processing this stub. It has no points of concern.
+            return result;
+        }
+
+        // Stack frame estimation. SPU code can be very long and consume several KB of stack.
+        u32 stack_frame_size = 128u;
+        // Actual ratio is usually around 1:4
+        const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+        // Because GHC doesn't preserve the stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+        // We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+        // FIXME: Aggressive spilling is only really a thing with vector operations. We can detect those instead.
+        // A proper fix is to port this to a MachineFunction (MF) pass, but I have PTSD from working at MF level.
+        const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+        stack_frame_size *= std::min(spill_pages, 32u); // 128 bytes to 4k, dynamic. It is unlikely that any frame consumes more than 4096 bytes.
+
+        result.stack_frame_size = stack_frame_size;
+        result.instruction_count = f.getInstructionCount();
+        result.num_external_calls = 0;
+
+        // The LR is not spared by LLVM in cases where there is a lot of spilling.
+        // This is another thing to be moved to a MachineFunction pass.
+        result.clobbers_x30 = result.instruction_count > 32;
+
+        for (auto& bb : f)
+        {
+            for (auto& inst : bb)
+            {
+                if (auto ci = llvm::dyn_cast<llvm::CallInst>(&inst))
+                {
+                    result.num_external_calls++;
+                    result.clobbers_x30 |= (!ci->isTailCall());
+                }
+            }
+        }
+
+        return result;
+    }
+
+    instruction_info_t GHC_frame_preservation_pass::decode_instruction(llvm::Function& f, llvm::Instruction* i)
+    {
+        instruction_info_t result{};
+        if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+        {
+            result.is_call_inst = true;
+            result.is_returning = true;
+            result.preserve_stack = !ci->isTailCall();
+            result.callee = ci->getCalledFunction();
+            result.is_tail_call = ci->isTailCall();
+
+            if (!result.callee)
+            {
+                // TODO: What are these? Patchpoints, maybe? Need to check again.
+                result.is_call_inst = f.getName() == "__spu-null";
+            }
+            else
+            {
+                result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
+            }
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::BranchInst>(i))
+        {
+            // More likely to jump out via an unconditional...
+            if (!bi->isConditional())
+            {
+                ensure(bi->getNumSuccessors() == 1);
+                auto targetbb = bi->getSuccessor(0);
+
+                result.callee = targetbb->getParent();
+                result.is_call_inst = result.callee->getName() != f.getName();
+            }
+
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::IndirectBrInst>(i))
+        {
+            // Very unlikely to be the same function. Can be considered a function exit.
+            ensure(bi->getNumDestinations() == 1);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::CallBrInst>(i))
+        {
+            ensure(bi->getNumSuccessors() == 1);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        if (auto bi = llvm::dyn_cast<llvm::InvokeInst>(i))
+        {
+            ensure(bi->getNumSuccessors() == 2);
+            auto targetbb = bi->getSuccessor(0);
+
+            result.callee = targetbb->getParent();
+            result.is_call_inst = result.callee->getName() != f.getName();
+            return result;
+        }
+
+        return result;
+    }
+
+    void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
+    {
+        if (f.getCallingConv() != llvm::CallingConv::GHC)
+        {
+            // If we're not using GHC, the calling convention handles stack fixup on its own via the prologue/epilogue.
+            return;
+        }
+
+        if (f.getInstructionCount() == 0)
+        {
+            // Nothing to do. Happens with placeholder functions such as branch patchpoints.
+            return;
+        }
+
+        const auto this_name = f.getName().str();
+        if (exclusion_callback && exclusion_callback(this_name))
+        {
+            // Function is explicitly excluded
+            return;
+        }
+
+        // Preprocessing.
+        auto function_info = preprocess_function(f);
+        if (function_info.num_external_calls == 0 && function_info.stack_frame_size == 0)
+        {
+            // No stack frame injection and no external calls to patch up. This is a leaf function, nothing to do.
+            return;
+        }
+
+        // Force tail calls on all terminators
+        force_tail_call_terminators(f);
+
+        // Asm snippets for patching the stack frame
+        std::string frame_prologue, frame_epilogue;
+
+        // Return address reload on exit. This is safer than trying to stuff things into the stack frame, since the frame size is largely guesswork at this time.
+        std::string x30_tail_restore = fmt::format(
+            "mov x30, #%u;\n"      // Load offset to last gateway exit
+            "add x30, x%u, x30;\n" // Add to base register
+            "ldr x30, [x30];\n",   // Load x30
+            execution_context.hypervisor_context_offset,
+            execution_context.base_register);
+
+        if (function_info.stack_frame_size > 0)
+        {
+            // NOTE: The stack frame here is purely optional; we could pre-allocate scratch on the gateway.
+            // However, that is an optimization for another time; this helps make debugging easier.
+            frame_prologue = fmt::format("sub sp, sp, #%u;", function_info.stack_frame_size);
+            frame_epilogue = fmt::format("add sp, sp, #%u;", function_info.stack_frame_size);
+
+            // Emit the frame prologue
+            LLVM_ASM_0(frame_prologue, irb, f.getContext());
+        }
+
+        // Now we start processing
+        bool terminator_found = false;
+        for (auto& bb : f)
+        {
+            for (auto bit = bb.begin(); bit != bb.end();)
+            {
+                const auto instruction_info = decode_instruction(f, &(*bit));
+                if (!instruction_info.is_call_inst)
+                {
+                    ++bit;
+                    continue;
+                }
+
+                std::string callee_name = "__unknown";
+                if (const auto cf = instruction_info.callee)
+                {
+                    callee_name = cf->getName().str();
+                    if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
+                    {
+                        // Always-inlined call, likely inline asm. Skip.
+                        // log("Function %s will ignore call to intrinsic function %s\n", this_name.c_str(), callee_name.c_str());
+                        ++bit;
+                        continue;
+                    }
+
+                    // Technically we should also ignore any host functions linked in, usually starting with a ppu_ or spu_ prefix.
+                    // However, there is little guarantee that those are safe, with only rare exceptions, and patching the frame around them doesn't hurt much anyway.
+                }
+
+                terminator_found |= instruction_info.is_tail_call;
+
+                if (!instruction_info.preserve_stack)
+                {
+                    // Now we patch the call if required. Normal calls that 'return' (i.e. calls to the C/C++ ABI) are not patched, as they manage the stack themselves (callee-managed).
+                    llvm::Instruction* original_inst = llvm::dyn_cast<llvm::Instruction>(bit);
+                    irb->SetInsertPoint(ensure(llvm::dyn_cast<llvm::Instruction>(bit)));
+
+                    if (function_info.stack_frame_size > 0)
+                    {
+                        // 1. Nuke all scratch
+                        LLVM_ASM_0(frame_epilogue, irb, f.getContext());
+                    }
+
+                    if (function_info.clobbers_x30)
+                    {
+                        // 2. Restore the gateway as the current return address
+                        LLVM_ASM_0(x30_tail_restore, irb, f.getContext());
+                    }
+
+                    // 3. We're about to make a tail call, which means that after this call we're supposed to return immediately. In that case, don't link; lower to a branch only.
+                    // Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
+                    // This means we burn some cycles on every exit, but in return we do not require an extra instruction in the prologue, and the ret chain is eliminated.
+                    // No ret-chain also means two BBs can call each other indefinitely without running out of stack and without relying on LLVM to optimize that away.
+
+                    std::string exit_fn;
+                    auto ci = ensure(llvm::dyn_cast<llvm::CallInst>(original_inst));
+                    auto operand_count = ci->getNumOperands();
+                    std::vector<std::string> constraints;
+                    std::vector<llvm::Value*> args;
+
+                    // We now load the callee args.
+                    // FIXME: This is often redundant and wastes cycles; we'll clean it up in a MachineFunction pass later.
+                    int base_reg = execution_context.base_register;
+                    for (unsigned i = 0; i < operand_count; ++i)
+                    {
+                        args.push_back(ci->getOperand(i));
+                        exit_fn += fmt::format("mov x%d, $%u;\n", base_reg++, i);
+                        constraints.push_back("r");
+                    }
+
+                    std::copy(ci->operands().begin(), ci->operands().end(), args.begin());
+                    auto target = ensure(ci->getCalledOperand());
+                    args.push_back(target);
+
+                    if (ci->isIndirectCall())
+                    {
+                        constraints.push_back("r");
+                        exit_fn += fmt::format(
+                            "mov x15, $%u;\n"
+                            "br x15",
+                            operand_count);
+                    }
+                    else
+                    {
+                        constraints.push_back("i");
+                        exit_fn += fmt::format("b $%u;\n", operand_count);
+                    }
+
+                    // Emit the branch
+                    LLVM_ASM(exit_fn, args, join_strings(constraints, ","), irb, f.getContext());
+
+                    // Delete the original call instruction
+                    bit = ci->eraseFromParent();
+                }
+
+                // Next
+                if (bit != bb.end())
+                {
+                    ++bit;
+                }
+            }
+        }
+
+        ensure(terminator_found, "Could not find terminator for function!");
+    }
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
new file mode 100644
index 0000000000..77ec184184
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64JIT.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#ifndef ARCH_ARM64
+#error "You have included an arm-only header"
+#endif
+
+#include <functional>
+#include "../CPUTranslator.h"
+
+#include <unordered_set>
+
+namespace aarch64
+{
+    enum gprs : s32
+    {
+        x0 = 0,
+        x1, x2, x3, x4, x5, x6, x7, x8, x9,
+        x10, x11, x12, x13, x14, x15, x16, x17, x18, x19,
+        x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30
+    };
+
+    // On non-x86 architectures GHC runs stackless. SP is treated as a pointer to scratchpad memory.
+    // This pass keeps that behavior intact while preserving the expectations of the host's C++ ABI.
+    class GHC_frame_preservation_pass : public translator_pass
+    {
+    public:
+        struct function_info_t
+        {
+            u32 instruction_count;
+            u32 num_external_calls;
+            u32 stack_frame_size; // Guessing this properly is critical for vector-heavy functions where spilling is a lot more common
+            bool clobbers_x30;
+        };
+
+        struct instruction_info_t
+        {
+            bool is_call_inst;   // Is a function call. This includes a branch to external code.
+            bool preserve_stack; // Preserve the stack around this call.
+            bool is_returning;   // This instruction "returns" to the next instruction (typically just llvm::CallInst*)
+            bool callee_is_GHC;  // The other function is GHC
+            bool is_tail_call;   // Tail call. Assume it is an exit/terminator.
+            llvm::Function* callee; // Callee, if any
+        };
+
+    protected:
+        std::unordered_set<std::string> visited_functions;
+
+        struct
+        {
+            gprs base_register;
+            u32 hypervisor_context_offset;
+        } execution_context;
+
+        std::function<bool(const std::string&)> exclusion_callback;
+
+        void force_tail_call_terminators(llvm::Function& f);
+
+        function_info_t preprocess_function(llvm::Function& f);
+
+        instruction_info_t decode_instruction(llvm::Function& f, llvm::Instruction* i);
+
+    public:
+        GHC_frame_preservation_pass(
+            gprs base_reg,
+            u32 hv_ctx_offset,
+            std::function<bool(const std::string&)> exclusion_callback = {});
+        ~GHC_frame_preservation_pass() = default;
+
+        void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
+        void reset() override;
+    };
+}
diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp
index 28bc0fc3e5..8d101fca11 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.cpp
+++ b/rpcs3/Emu/CPU/CPUTranslator.cpp
@@ -392,6 +392,17 @@ void cpu_translator::replace_intrinsics(llvm::Function& f)
     }
 }
 
+void cpu_translator::run_transforms(llvm::Function& f)
+{
+    // This pass must run first because the other passes may depend on resolved names.
+    replace_intrinsics(f);
+
+    for (auto& pass : m_transform_passes)
+    {
+        pass->run(m_ir, f);
+    }
+}
+
 void cpu_translator::erase_stores(llvm::ArrayRef<llvm::Value*> args)
 {
     for (auto v : args)
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 2cefe69a32..a2aa9f7213 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3033,6 +3033,16 @@ struct llvm_calli
     }
 };
 
+class translator_pass
+{
+public:
+    translator_pass() = default;
+    virtual ~translator_pass() {}
+
+    virtual void run(llvm::IRBuilder<>* irb, llvm::Function& func) = 0;
+    virtual void reset() = 0;
+};
+
 class cpu_translator
 {
 protected:
@@ -3074,9 +3084,18 @@ protected:
     // IR builder
     llvm::IRBuilder<>* m_ir = nullptr;
 
+    // Customized transformation passes. Technically the intrinsics replacement belongs here.
+    std::vector<std::unique_ptr<translator_pass>> m_transform_passes;
+
     void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
 
 public:
+    // Register a transformation pass to be run before final compilation by LLVM
+    void register_transform_pass(std::unique_ptr<translator_pass>& pass)
+    {
+        m_transform_passes.emplace_back(std::move(pass));
+    }
+
     // Convert a C++ type to an LLVM type (TODO: remove)
     template <typename T>
     llvm::Type* GetType()
@@ -3778,9 +3797,12 @@ public:
         }
     }
 
-    // Finalize processing custom intrinsics
+    // Run intrinsics replacement pass
     void replace_intrinsics(llvm::Function&);
 
+    // Finalize processing
+    void run_transforms(llvm::Function&);
+
     // Erase store instructions of provided
     void erase_stores(llvm::ArrayRef<llvm::Value*> args);
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 521077ac1c..165955fe88 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -269,7 +269,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
         }
     }
 
-    replace_intrinsics(*m_function);
+    run_transforms(*m_function);
 
     return m_function;
 }
@@ -321,7 +321,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     {
         // Possible special case for no functions (allowing the do-while optimization)
         m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
- replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -379,7 +379,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here - replace_intrinsics(*m_function); + run_transforms(*m_function); return m_function; } @@ -5375,7 +5375,7 @@ void PPUTranslator::build_interpreter() op.vc = 3; \ this->i(op); \ VMEscape(); \ - replace_intrinsics(*m_function); \ + run_transforms(*m_function); \ } BUILD_VEC_INST(VADDCUW); diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 4c704d5e2b..574d4acb93 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -2605,7 +2605,7 @@ public: for (auto& f : *m_module) { - replace_intrinsics(f); + run_transforms(f); } for (const auto& func : m_functions) @@ -3089,7 +3089,7 @@ public: for (auto& f : *_module) { - replace_intrinsics(f); + run_transforms(f); } std::string log;
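
Note: this patch adds the pass and the register_transform_pass() hook, but does not itself wire the pass into a recompiler; that registration happens in the backends. A minimal sketch of what a call site could look like against the API above — the base register choice, context offset, include path, and exclusion rule here are invented placeholders for illustration, not values taken from this patch:

#include "Emu/CPU/Backends/AArch64JIT.h"

// Hypothetical call site. Assumes x19 holds the thread-context pointer and
// that the saved gateway x30 lives at offset 0 inside the hypervisor context;
// both are illustrative stand-ins, not values from this patch.
static void register_aarch64_passes(cpu_translator& translator)
{
    std::unique_ptr<translator_pass> pass = std::make_unique<aarch64::GHC_frame_preservation_pass>(
        aarch64::x19, // base register holding the execution context (assumption)
        0,            // hv_ctx_offset: offset of the saved x30 (assumption)
        [](const std::string& name)
        {
            // Skip stubs with nothing to patch, mirroring the pass's own "__spu-null" special case.
            return name == "__spu-null";
        });

    // Ownership moves into the translator; the pass then runs on every
    // function via run_transforms() before final LLVM compilation.
    translator.register_transform_pass(pass);
}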