SPU ASMJIT v2.0

Use X86Assembler and blocks
2025-04-20 19:45:20 +00:00 · 2018-04-09 17:45:37 +03:00 · 2018-04-09 17:45:37 +03:00 · 8ca33bcb94
commit 8ca33bcb94
parent 477522210e
12 changed files with 1363 additions and 1187 deletions
--- a/Utilities/sysinfo.cpp
+++ b/Utilities/sysinfo.cpp
@@ -13,6 +13,12 @@ bool utils::has_ssse3()
 	return g_value;
 }

+bool utils::has_sse41()
+{
+	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x80000;
+	return g_value;
+}
+
 bool utils::has_avx()
 {
 	static const bool g_value = get_cpuid(0, 0)[0] >= 0x1 && get_cpuid(1, 0)[2] & 0x10000000 && (get_cpuid(1, 0)[2] & 0x0C000000) == 0x0C000000 && (get_xgetbv(0) & 0x6) == 0x6;
--- a/Utilities/sysinfo.h
+++ b/Utilities/sysinfo.h
@@ -29,6 +29,8 @@ namespace utils

 	bool has_ssse3();

+	bool has_sse41();
+
 	bool has_avx();

 	bool has_avx2();
--- a/rpcs3/Emu/CPU/CPUDisAsm.h
+++ b/rpcs3/Emu/CPU/CPUDisAsm.h
@@ -42,10 +42,10 @@ protected:
 public:
 	std::string last_opcode;
 	u32 dump_pc;
-	u8* offset;
+	const u8* offset;

 protected:
-	CPUDisAsm(CPUDisAsmMode mode) 
+	CPUDisAsm(CPUDisAsmMode mode)
 		: m_mode(mode)
 		, offset(0)
 	{
--- a/rpcs3/Emu/Cell/RawSPUThread.cpp
+++ b/rpcs3/Emu/Cell/RawSPUThread.cpp
@@ -37,7 +37,7 @@ void RawSPUThread::on_init(const std::shared_ptr<void>& _this)
 }

 RawSPUThread::RawSPUThread(const std::string& name)
-	: SPUThread(name)
+	: SPUThread(name, 0, nullptr)
 {
 }

--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@@ -1,56 +1,73 @@
 #pragma once

+#include "Utilities/mutex.h"
 #include "SPURecompiler.h"

-namespace asmjit
+#include <functional>
+
+#define ASMJIT_STATIC
+#define ASMJIT_DEBUG
+
+#include "asmjit.h"
+
+// SPU ASMJIT Runtime object (global)
+class spu_runtime
 {
-	struct JitRuntime;
-	struct CodeHolder;
-	struct X86Compiler;
-	struct X86Gp;
-	struct X86Xmm;
-	struct X86Mem;
-	struct Label;
-}
+	shared_mutex m_mutex;
+
+	asmjit::JitRuntime m_jitrt;
+
+	// All functions
+	std::map<std::vector<u32>, spu_function_t> m_map;
+
+	// TODO
+	std::array<atomic_t<spu_function_t>, 0x10000> m_dispatcher;
+
+	friend class spu_recompiler;
+
+public:
+	spu_runtime();
+};

 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
-	const std::shared_ptr<asmjit::JitRuntime> m_jit;
+	const std::shared_ptr<asmjit::JitRuntime> m_rt;
+
+	std::shared_ptr<spu_runtime> m_spurt;

 public:
-	spu_recompiler();
+	spu_recompiler(class SPUThread& spu);

-	virtual void compile(spu_function_t& f) override;
+	virtual spu_function_t compile(const std::vector<u32>& func) override;

 private:
 	// emitter:
-	asmjit::X86Compiler* c;
-	asmjit::CodeHolder* codeHolder;
+	asmjit::X86Assembler* c;

-	// input:
-	asmjit::X86Gp* cpu;
-	asmjit::X86Gp* ls;
+	// arguments:
+	const asmjit::X86Gp* cpu;
+	const asmjit::X86Gp* ls;
+	const asmjit::X86Gp* qw0;
+	const asmjit::X86Gp* qw1;

 	// temporary:
-	asmjit::X86Gp* addr;
-	asmjit::X86Gp* qw0;
-	asmjit::X86Gp* qw1;
-	asmjit::X86Gp* qw2;
-	asmjit::X86Gp* qw3;
-	std::array<asmjit::X86Xmm*, 6> vec;
+	const asmjit::X86Gp* addr;
+	std::array<const asmjit::X86Xmm*, 6> vec;

-	// labels:
-	asmjit::Label* labels; // array[0x10000]
-	asmjit::Label* jt; // jump table resolver (uses *addr)
-	asmjit::Label* end; // function end (return *addr)
+	// Work deferred until the end of the function:
+	std::vector<std::function<void()>> after;
+	std::vector<std::function<void()>> consts;
+
+	// All emitted 128-bit consts
+	std::map<std::pair<u64, u64>, asmjit::Label> xmm_consts;

 	class XmmLink
 	{
-		asmjit::X86Xmm* m_var;
+		const asmjit::X86Xmm* m_var;

 	public:
-		XmmLink(asmjit::X86Xmm*& xmm_var)
+		XmmLink(const asmjit::X86Xmm*& xmm_var)
 			: m_var(xmm_var)
 		{
 			xmm_var = nullptr;
@@ -58,7 +75,7 @@ private:

 		XmmLink(XmmLink&&) = default; // MoveConstructible + delete copy constructor and copy/move operators

-		operator asmjit::X86Xmm&() const
+		operator const asmjit::X86Xmm&() const
 		{
 			return *m_var;
 		}
@@ -78,10 +95,15 @@ private:
 	asmjit::X86Mem XmmConst(__m128 data);
 	asmjit::X86Mem XmmConst(__m128i data);

+	void branch_fixed(u32 target);
+	void branch_indirect(spu_opcode_t op);
+	asmjit::Label halt(u32 pos);
+	void fall(spu_opcode_t op);
+	void save_rcx();
+	void load_rcx();
+
 public:
-	void CheckInterruptStatus(spu_opcode_t op);
-	void InterpreterCall(spu_opcode_t op);
-	void FunctionCall();
+	void UNK(spu_opcode_t op);

 	void STOP(spu_opcode_t op);
 	void LNOP(spu_opcode_t op);
@@ -282,6 +304,4 @@ public:
 	void FNMS(spu_opcode_t op);
 	void FMA(spu_opcode_t op);
 	void FMS(spu_opcode_t op);
-
-	void UNK(spu_opcode_t op);
 };
--- a/rpcs3/Emu/Cell/SPUAnalyser.cpp
+++ b/rpcs3/Emu/Cell/SPUAnalyser.cpp
@@ -1,39 +1,12 @@
 #include "stdafx.h"
+#include "Emu/Memory/vm.h"
 #include "SPUAnalyser.h"
 #include "SPURecompiler.h"
 #include "SPUOpcodes.h"

 const spu_decoder<spu_itype> s_spu_itype;

-spu_function_t* SPUDatabase::find(const be_t<u32>* data, u64 key, u32 max_size)
-{
-	for (auto found = m_db.equal_range(key); found.first != found.second; found.first++)
-	{
-		const auto& func = found.first->second;
-
-		// Compare binary data explicitly (TODO: optimize)
-		if (LIKELY(func->size <= max_size) && std::memcmp(func->data.data(), data, func->size) == 0)
-		{
-			return func.get();
-		}
-	}
-
-	return nullptr;
-}
-
-SPUDatabase::SPUDatabase()
-{
-	// TODO: load existing database associated with currently running executable
-
-	LOG_SUCCESS(SPU, "SPU Database initialized...");
-}
-
-SPUDatabase::~SPUDatabase()
-{
-	// TODO: serialize database
-}
-
-spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_limit)
+std::shared_ptr<spu_function> spu_analyse(const be_t<u32>* ls, u32 entry, u32 max_limit)
 {
 	// Check arguments (bounds and alignment)
 	if (max_limit > 0x40000 || entry >= max_limit || entry % 4 || max_limit % 4)
@@ -47,23 +20,23 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 	const u32 block_sz = max_limit - entry;

 	{
-		reader_lock lock(m_mutex);
+		//reader_lock lock(m_mutex);

 		// Try to find existing function in the database
-		if (auto func = find(base, key, block_sz))
-		{
-			return func;
-		}
+		// if (auto func = find(base, key, block_sz))
+		// {
+		// 	return func;
+		// }
 	}

 	{
-		writer_lock lock(m_mutex);
+		//writer_lock lock(m_mutex);

 		// Double-check
-		if (auto func = find(base, key, block_sz))
-		{
-			return func;
-		}
+		// if (auto func = find(base, key, block_sz))
+		// {
+		// 	return func;
+		// }
 	}

 	// Initialize block entries with the function entry point
@@ -89,14 +62,14 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 		const auto type = s_spu_itype.decode(op.opcode);

 		{
-			reader_lock lock(m_mutex);
+			//reader_lock lock(m_mutex);

 			// Find existing function
-			if (pos != entry && find(ls + pos / 4, pos | u64{ op.opcode } << 32, limit - pos))
-			{
-				limit = pos;
-				break;
-			}
+			// if (pos != entry && find(ls + pos / 4, pos | u64{ op.opcode } << 32, limit - pos))
+			// {
+			// 	limit = pos;
+			// 	break;
+			// }
 		}

 		// Additional analysis at the beginning of the block
@@ -156,7 +129,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim

 				// Fix pos value
 				start = pos; pos = pos - 4;
-				
+
 				continue;
 			}

@@ -179,10 +152,10 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim

 		// if upcoming instruction is not BI, reset the pigeonhole optimization
 		// todo: can constant propogation somewhere get rid of this check?
-		if ((type != BI))
+		if ((type != spu_itype::BI))
 			ila_r2_addr = 0; // reset
-		
-		if (type == BI || type == IRET) // Branch Indirect
+
+		if (type == spu_itype::BI || type == spu_itype::IRET) // Branch Indirect
 		{
 			blocks.emplace(start);
 			start = pos + 4;
@@ -190,9 +163,9 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 			if (op.ra == 2 && ila_r2_addr > entry)
 				blocks.emplace(ila_r2_addr);
 		}
-		else if (type == BR || type == BRA) // Branch Relative/Absolute
+		else if (type == spu_itype::BR || type == spu_itype::BRA) // Branch Relative/Absolute
 		{
-			const u32 target = spu_branch_target(type == BR ? pos : 0, op.i16);
+			const u32 target = spu_branch_target(type == spu_itype::BR ? pos : 0, op.i16);

 			// Add adjacent function because it always could be
 			adjacent.emplace(target);
@@ -205,9 +178,9 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 			blocks.emplace(start);
 			start = pos + 4;
 		}
-		else if (type == BRSL || type == BRASL) // Branch Relative/Absolute and Set Link
+		else if (type == spu_itype::BRSL || type == spu_itype::BRASL) // Branch Relative/Absolute and Set Link
 		{
-			const u32 target = spu_branch_target(type == BRSL ? pos : 0, op.i16);
+			const u32 target = spu_branch_target(type == spu_itype::BRSL ? pos : 0, op.i16);

 			if (target == pos + 4)
 			{
@@ -228,11 +201,11 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 				if (op.rt != 0) LOG_ERROR(SPU, "[0x%05x] Function call without $LR", pos);
 			}
 		}
-		else if (type == BISL || type == BISLED) // Branch Indirect and Set Link
+		else if (type == spu_itype::BISL || type == spu_itype::BISLED) // Branch Indirect and Set Link
 		{
 			if (op.rt != 0) LOG_ERROR(SPU, "[0x%05x] Indirect function call without $LR", pos);
 		}
-		else if (type == BRNZ || type == BRZ || type == BRHNZ || type == BRHZ) // Branch Relative if (Not) Zero (Half)word
+		else if (type == spu_itype::BRNZ || type == spu_itype::BRZ || type == spu_itype::BRHNZ || type == spu_itype::BRHZ) // Branch Relative if (Not) Zero (Half)word
 		{
 			const u32 target = spu_branch_target(pos, op.i16);

@@ -244,7 +217,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 				blocks.emplace(target);
 			}
 		}
-		else if (type == LNOP || type == NOP) {
+		else if (type == spu_itype::LNOP || type == spu_itype::NOP) {
 			// theres a chance that theres some random lnops/nops after the end of a function
 			// havent found a definite pattern, but, is an easy optimization to check for, just push start down if lnop is tagged as a start
 			// todo: remove the last added start pos as its probly unnecessary
@@ -262,7 +235,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 			// Analyse stack pointer access
 			else if (rt == 1)
 			{
-				if (type == ILA && pos < ila_sp_pos)
+				if (type == spu_itype::ILA && pos < ila_sp_pos)
 				{
 					// set minimal ila $SP,* instruction position
 					ila_sp_pos = pos;
@@ -272,7 +245,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 			// ila r2, addr
 			// bi r2
 			else if (rt == 2) {
-				if (type == ILA)
+				if (type == spu_itype::ILA)
 					ila_r2_addr = spu_branch_target(op.i18);
 			}
 		}
@@ -285,9 +258,9 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim

 		const auto type = s_spu_itype.decode(op.opcode);

-		if (type == BRSL || type == BRASL) // Branch Relative/Absolute and Set Link
+		if (type == spu_itype::BRSL || type == spu_itype::BRASL) // Branch Relative/Absolute and Set Link
 		{
-			const u32 target = spu_branch_target(type == BRSL ? pos : 0, op.i16);
+			const u32 target = spu_branch_target(type == spu_itype::BRSL ? pos : 0, op.i16);

 			if (target != pos + 4 && target > entry && limit > target)
 			{
@@ -308,7 +281,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 	}

 	// Prepare new function (set addr and size)
-	auto func = std::make_shared<spu_function_t>(entry, limit - entry);
+	auto func = std::make_shared<spu_function>(entry, limit - entry);

 	// Copy function contents
 	func->data = { ls + entry / 4, ls + limit / 4 };
@@ -346,13 +319,13 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
 	// Lock here just before we write to the db
 	// Its is unlikely that the second check will pass anyway so we delay this step since compiling functions is very fast
 	{
-		writer_lock lock(m_mutex);
+		//writer_lock lock(m_mutex);

 		// Add function to the database
-		m_db.emplace(key, func);
+		//m_db.emplace(key, func);
 	}

 	LOG_NOTICE(SPU, "Function detected [0x%05x-0x%05x] (size=0x%x)", func->addr, func->addr + func->size, func->size);

-	return func.get();
+	return func;
 }
--- a/rpcs3/Emu/Cell/SPUAnalyser.h
+++ b/rpcs3/Emu/Cell/SPUAnalyser.h
@@ -1,7 +1,6 @@
 #pragma once

-#include "Utilities/mutex.h"
-
+#include <vector>
 #include <set>

 // SPU Instruction Type
@@ -247,7 +246,7 @@ struct spu_itype
 class SPUThread;

 // SPU basic function information structure
-struct spu_function_t
+struct spu_function
 {
 	// Entry point (LS address)
 	const u32 addr;
@@ -273,28 +272,9 @@ struct spu_function_t
 	// Pointer to the compiled function
 	u32(*compiled)(SPUThread* _spu, be_t<u32>* _ls) = nullptr;

-	spu_function_t(u32 addr, u32 size)
+	spu_function(u32 addr, u32 size)
 		: addr(addr)
 		, size(size)
 	{
 	}
 };
-
-// SPU Function Database (must be global or PS3 process-local)
-class SPUDatabase final : spu_itype
-{
-	shared_mutex m_mutex;
-
-	// All registered functions (uses addr and first instruction as a key)
-	std::unordered_multimap<u64, std::shared_ptr<spu_function_t>> m_db;
-
-	// For internal use
-	spu_function_t* find(const be_t<u32>* data, u64 key, u32 max_size);
-
-public:
-	SPUDatabase();
-	~SPUDatabase();
-
-	// Try to retrieve SPU function information
-	spu_function_t* analyse(const be_t<u32>* ls, u32 entry, u32 limit = 0x40000);
-};
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -3,92 +3,183 @@
 #include "Emu/Memory/Memory.h"

 #include "SPUThread.h"
+#include "SPUAnalyser.h"
 #include "SPURecompiler.h"
-#include "SPUASMJITRecompiler.h"
 #include <algorithm>

 extern u64 get_system_time();

+const spu_decoder<spu_itype> s_spu_itype;
+
+spu_recompiler_base::spu_recompiler_base(SPUThread& spu)
+	: m_spu(spu)
+{
+	// Initialize lookup table
+	spu.jit_dispatcher.fill(&dispatch);
+
+	// Initialize "empty" block
+	spu.jit_map[std::vector<u32>()] = &dispatch;
+}
+
 spu_recompiler_base::~spu_recompiler_base()
 {
 }

-void spu_recompiler_base::enter(SPUThread& spu)
+void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip)
 {
-	if (spu.pc >= 0x40000 || spu.pc % 4)
+	const auto result = spu.jit_map.emplace(block(spu, spu.pc), nullptr);
+
+	if (result.second || !result.first->second)
 	{
-		fmt::throw_exception("Invalid PC: 0x%05x", spu.pc);
+		result.first->second = spu.jit->compile(result.first->first);
 	}

-	// Get SPU LS pointer
-	const auto _ls = vm::_ptr<u32>(spu.offset);
+	spu.jit_dispatcher[spu.pc / 4] = result.first->second;
+}

-	// Search if cached data matches
-	auto func = spu.compiled_cache[spu.pc / 4];
+void spu_recompiler_base::branch(SPUThread& spu, std::pair<const std::vector<u32>, spu_function_t>* pair, u8* rip)
+{
+	spu.pc = pair->first[0];

-	// Check shared db if we dont have a match
-	if (!func || !std::equal(func->data.begin(), func->data.end(), _ls + spu.pc / 4, [](const be_t<u32>& l, const be_t<u32>& r) { return *(u32*)(u8*)&l == *(u32*)(u8*)&r; }))
+	if (!pair->second)
 	{
-		func = spu.spu_db->analyse(_ls, spu.pc);
-		spu.compiled_cache[spu.pc / 4] = func;
+		pair->second = spu.jit->compile(pair->first);
 	}

-	// Reset callstack if necessary
-	if ((func->does_reset_stack && spu.recursion_level) || spu.recursion_level >= 128)
-	{
-		spu.state += cpu_flag::ret;
-		return;
-	}
+	spu.jit_dispatcher[spu.pc / 4] = pair->second;

-	// Compile if needed
-	if (!func->compiled)
+	// Overwrite jump to this function with jump to the compiled function
+	const s64 rel = reinterpret_cast<u64>(pair->second) - reinterpret_cast<u64>(rip) - 5;
+
+	if (rel >= INT32_MIN && rel <= INT32_MAX)
 	{
-		if (!spu.spu_rec)
+		const s64 rel8 = (rel + 5) - 2;
+
+		alignas(8) u8 bytes[8];
+
+		if (rel8 >= INT8_MIN && rel8 <= INT8_MAX)
 		{
-			spu.spu_rec = fxm::get_always<spu_recompiler>();
+			bytes[0] = 0xeb; // jmp rel8
+			bytes[1] = static_cast<s8>(rel8);
+			std::memset(bytes + 2, 0x90, 5);
+			bytes[7] = 0x48;
+		}
+		else
+		{
+			bytes[0] = 0xe9; // jmp rel32
+			std::memcpy(bytes + 1, &rel, 4);
+			std::memset(bytes + 5, 0x90, 2);
+			bytes[7] = 0x48;
 		}

-		spu.spu_rec->compile(*func);
-
-		if (!func->compiled) fmt::throw_exception("Compilation failed" HERE);
+#ifdef _MSC_VER
+		*(volatile u64*)(rip) = *reinterpret_cast<u64*>(+bytes);
+#else
+		__atomic_store_n(reinterpret_cast<u64*>(rip), *reinterpret_cast<u64*>(+bytes), __ATOMIC_RELAXED);
+#endif
 	}
-
-	const u32 res = func->compiled(&spu, _ls);
-
-	if (const auto exception = spu.pending_exception)
+	else
 	{
-		spu.pending_exception = nullptr;
-		std::rethrow_exception(exception);
-	}
+		alignas(16) u8 bytes[16];

-	if (res & 0x1000000)
-	{
-		spu.halt();
-	}
+		bytes[0] = 0xff; // jmp [rip+2]
+		bytes[1] = 0x25;
+		bytes[2] = 0x02;
+		bytes[3] = 0x00;
+		bytes[4] = 0x00;
+		bytes[5] = 0x00;
+		bytes[6] = 0x48; // mov rax, imm64 (not executed)
+		bytes[7] = 0xb8;
+		std::memcpy(bytes + 8, &pair->second, 8);

-	if (res & 0x2000000)
-	{
-	}
-
-	if (res & 0x4000000)
-	{
-		if (res & 0x8000000)
-		{
-			fmt::throw_exception("Invalid interrupt status set (0x%x)" HERE, res);
-		}
-
-		spu.set_interrupt_status(true);
-	}
-	else if (res & 0x8000000)
-	{
-		spu.set_interrupt_status(false);
-	}
-
-	spu.pc = res & 0x3fffc;
-
-	if (spu.interrupts_enabled && (spu.ch_event_mask & spu.ch_event_stat & SPU_EVENT_INTR_IMPLEMENTED) > 0)
-	{
-		spu.interrupts_enabled = false;
-		spu.srr0 = std::exchange(spu.pc, 0);
+		reinterpret_cast<atomic_t<u128>*>(rip)->store(*reinterpret_cast<u128*>(+bytes));
 	}
 }
+
+std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa)
+{
+	u32 addr = lsa;
+
+	std::vector<u32> result;
+
+	while (addr < 0x40000)
+	{
+		const u32 data = spu._ref<u32>(addr);
+
+		if (data == 0 && addr == lsa)
+		{
+			break;
+		}
+
+		addr += 4;
+
+		if (result.empty())
+		{
+			result.emplace_back(lsa);
+		}
+
+		result.emplace_back(se_storage<u32>::swap(data));
+
+		const auto type = s_spu_itype.decode(data);
+
+		switch (type)
+		{
+		case spu_itype::UNK:
+		case spu_itype::STOP:
+		case spu_itype::STOPD:
+		case spu_itype::SYNC:
+		case spu_itype::DSYNC:
+		case spu_itype::DFCEQ:
+		case spu_itype::DFCMEQ:
+		case spu_itype::DFCGT:
+		//case spu_itype::DFCMGT:
+		case spu_itype::DFTSV:
+		case spu_itype::BI:
+		case spu_itype::IRET:
+		case spu_itype::BISL:
+		{
+			break;
+		}
+		case spu_itype::BRA:
+		case spu_itype::BRASL:
+		{
+			if (spu_branch_target(0, spu_opcode_t{data}.i16) == addr)
+			{
+				continue;
+			}
+
+			break;
+		}
+		case spu_itype::BR:
+		case spu_itype::BRSL:
+		{
+			if (spu_branch_target(addr - 4, spu_opcode_t{data}.i16) == addr)
+			{
+				continue;
+			}
+
+			break;
+		}
+		case spu_itype::BRZ:
+		case spu_itype::BRNZ:
+		case spu_itype::BRHZ:
+		case spu_itype::BRHNZ:
+		{
+			if (spu_branch_target(addr - 4, spu_opcode_t{data}.i16) >= addr)
+			{
+				continue;
+			}
+
+			break;
+		}
+		default:
+		{
+			continue;
+		}
+		}
+
+		break;
+	}
+
+	return result;
+}
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -1,25 +1,32 @@
 #pragma once

-#include "SPUAnalyser.h"
+#include "SPUThread.h"

-#include <mutex>
-
-// SPU Recompiler instance base (must be global or PS3 process-local)
+// SPU Recompiler instance base class
 class spu_recompiler_base
 {
 protected:
-	std::mutex m_mutex; // must be locked in compile()
+	SPUThread& m_spu;

-	const spu_function_t* m_func; // current function
-
-	u32 m_pos; // current position
+	u32 m_pos;

 public:
+	spu_recompiler_base(SPUThread& spu);
+
 	virtual ~spu_recompiler_base();

-	// Compile specified function
-	virtual void compile(spu_function_t& f) = 0;
+	// Compile function
+	virtual spu_function_t compile(const std::vector<u32>& func) = 0;

-	// Run
-	static void enter(class SPUThread&);
+	// Default dispatch function fallback (second pointer is unused)
+	static void dispatch(SPUThread&, void*, u8*);
+
+	// Direct branch fallback for non-compiled destination
+	static void branch(SPUThread&, std::pair<const std::vector<u32>, spu_function_t>*, u8* rip);
+
+	// Get the block at specified address
+	static std::vector<u32> block(SPUThread&, u32 lsa);
+
+	// Create recompiler instance (ASMJIT)
+	static std::unique_ptr<spu_recompiler_base> make_asmjit_recompiler(SPUThread& spu);
 };
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -314,7 +314,7 @@ std::string SPUThread::dump() const

 	// Print some transaction statistics
 	fmt::append(ret, "\nTX: %u; Fail: %u (0x%x)", tx_success, tx_failure, tx_status);
-	fmt::append(ret, "\nRaddr: 0x%08x; R: 0x%x", raddr, raddr ? +vm::reservation_acquire(raddr, 128) : 0);
+	fmt::append(ret, "\nBlocks: %u; Fail: %u", block_counter, block_failure);
 	fmt::append(ret, "\nTag Mask: 0x%08x", ch_tag_mask);
 	fmt::append(ret, "\nMFC Stall: 0x%08x", ch_stall_mask);
 	fmt::append(ret, "\nMFC Queue Size: %u", mfc_size);
@@ -397,12 +397,6 @@ void SPUThread::cpu_task()
 {
 	std::fesetround(FE_TOWARDZERO);

-	if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
-	{
-		if (!spu_db) spu_db = fxm::get_always<SPUDatabase>();
-		return spu_recompiler_base::enter(*this);
-	}
-
 	g_tls_log_prefix = []
 	{
 		const auto cpu = static_cast<SPUThread*>(get_current_cpu_thread());
@@ -410,6 +404,16 @@ void SPUThread::cpu_task()
 		return fmt::format("%s [0x%05x]", cpu->get_name(), cpu->pc);
 	};

+	if (jit)
+	{
+		while (LIKELY(!test(state) || !check_state()))
+		{
+			jit_dispatcher[pc / 4](*this, vm::_ptr<u8>(offset), nullptr);
+		}
+
+		return;
+	}
+
 	// Select opcode table
 	const auto& table = *(
 		g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() :
@@ -502,15 +506,6 @@ SPUThread::~SPUThread()
 	vm::dealloc_verbose_nothrow(offset);
 }

-SPUThread::SPUThread(const std::string& name)
-	: cpu_thread(idm::last_id())
-	, m_name(name)
-	, index(0)
-	, offset(0)
-	, group(nullptr)
-{
-}
-
 SPUThread::SPUThread(const std::string& name, u32 index, lv2_spu_group* group)
 	: cpu_thread(idm::last_id())
 	, m_name(name)
@@ -518,6 +513,14 @@ SPUThread::SPUThread(const std::string& name, u32 index, lv2_spu_group* group)
 	, offset(0)
 	, group(group)
 {
+	if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+	{
+		jit = spu_recompiler_base::make_asmjit_recompiler(*this);
+	}
+
+	if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+	{
+	}
 }

 void SPUThread::push_snr(u32 number, u32 value)
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -5,10 +5,17 @@
 #include "Emu/Cell/SPUInterpreter.h"
 #include "MFC.h"

+#include <map>
+
 struct lv2_event_queue;
 struct lv2_spu_group;
 struct lv2_int_tag;

+class SPUThread;
+
+// JIT Block
+using spu_function_t = void(*)(SPUThread&, void*, u8*);
+
 // SPU Channels
 enum : u32
 {
@@ -514,16 +521,14 @@ public:
 	virtual ~SPUThread() override;
 	void cpu_init();

-protected:
-	SPUThread(const std::string& name);
-
-public:
 	static const u32 id_base = 0x02000000; // TODO (used to determine thread type)
 	static const u32 id_step = 1;
 	static const u32 id_count = 2048;

 	SPUThread(const std::string& name, u32 index, lv2_spu_group* group);

+	u32 pc = 0;
+
 	// General-Purpose Registers
 	std::array<v128, 128> gpr;
 	SPU_FPSCR fpscr;
@@ -577,24 +582,26 @@ public:
 	std::array<std::pair<u32, std::weak_ptr<lv2_event_queue>>, 32> spuq; // Event Queue Keys for SPU Thread
 	std::weak_ptr<lv2_event_queue> spup[64]; // SPU Ports

-	u32 pc = 0; //
 	const u32 index; // SPU index
 	const u32 offset; // SPU LS offset
 	lv2_spu_group* const group; // SPU Thread Group

 	const std::string m_name; // Thread name

-	std::exception_ptr pending_exception;
-
-	std::array<struct spu_function_t*, 65536> compiled_cache{};
-	std::shared_ptr<class SPUDatabase> spu_db;
-	std::shared_ptr<class spu_recompiler_base> spu_rec;
-	u32 recursion_level = 0;
-
 	u64 tx_success = 0;
 	u64 tx_failure = 0;
 	uint tx_status = 0;

+	std::unique_ptr<class spu_recompiler_base> jit; // Recompiler instance
+
+	std::map<std::vector<u32>, spu_function_t> jit_map; // All compiled blocks (first u32 is addr)
+
+	u64 block_counter = 0;
+	u64 block_recover = 0;
+	u64 block_failure = 0;
+
+	std::array<spu_function_t, 0x10000> jit_dispatcher; // Dispatch table for indirect calls
+
 	void push_snr(u32 number, u32 value);
 	void do_dma_transfer(const spu_mfc_cmd& args);
 	bool do_dma_check(const spu_mfc_cmd& args);