SPU ASMJIT: übertrampolines and spu_runtime

Use opt-out shared spu_runtime to save memory (Option: SPU Shared Runtime)
Implement "übertrampolines" for dispatching compiled blocks
Patch fixed branch points to use trampolines after check failure
Nekotekina 2018-04-16 18:27:57 +03:00
parent 8ca33bcb94
commit 3ffafb741c
7 changed files with 329 additions and 61 deletions
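The "übertrampoline" generated in compile() below is a branch tree that selects, at runtime, among all compiled variants of a block sharing the same starting address: it compares instruction words in SPU local storage against the contents recorded for each candidate and jumps to the matching function, or to dispatch() for entries that have not been compiled yet. A minimal C++ sketch of the selection logic the generated x86 encodes (illustrative only: a linear scan instead of the emitted cmp/jb/ja tree; select_block and fallback_dispatch are hypothetical names):

#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using u8 = std::uint8_t;
using u32 = std::uint32_t;
using spu_function_t = void(*)(class SPUThread&, void*, u8*);

// Stand-in for spu_recompiler_base::dispatch
void fallback_dispatch(class SPUThread&, void*, u8*);

// Keys are laid out as in spu_recompiler_base::block(): key[0] is the start
// address, key[1..] are the instruction words captured at compile time.
spu_function_t select_block(const std::map<std::vector<u32>, spu_function_t>& map, const u8* ls, u32 addr)
{
	for (const auto& [key, fn] : map)
	{
		if (key[0] != addr)
		{
			continue; // Different starting address
		}

		// Compare every recorded word against current local storage
		if (std::equal(key.begin() + 1, key.end(), reinterpret_cast<const u32*>(ls + addr)))
		{
			return fn ? fn : &fallback_dispatch; // Null entry: not compiled yet
		}
	}

	return &fallback_dispatch;
}

Because the map is ordered and all candidates for one address form a contiguous range, the emitted trampoline can halve the range on each comparison, so selection cost grows logarithmically with the number of variants rather than linearly as in this sketch.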

View file

@@ -31,12 +31,6 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
spu_runtime::spu_runtime()
{
if (g_cfg.core.spu_debug)
{
fs::file log(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite);
log.write(fmt::format("SPU JIT Log...\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle().c_str(), Emu.GetTitleID().c_str()));
}
LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
// Initialize lookup table
@@ -51,8 +45,23 @@ spu_runtime::spu_runtime()
spu_recompiler::spu_recompiler(SPUThread& spu)
: spu_recompiler_base(spu)
, m_rt(std::make_shared<asmjit::JitRuntime>())
{
if (!g_cfg.core.spu_shared_runtime)
{
m_spurt = std::make_shared<spu_runtime>();
}
}
spu_function_t spu_recompiler::get(u32 lsa)
{
// Initialize if necessary
if (!m_spurt)
{
m_spurt = fxm::get_always<spu_runtime>();
}
// Simple atomic read
return m_spurt->m_dispatcher[lsa / 4];
}
spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
@@ -63,6 +72,24 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
m_spurt = fxm::get_always<spu_runtime>();
}
// Don't lock without shared runtime
std::unique_lock<shared_mutex> lock(m_spurt->m_mutex, std::defer_lock);
if (g_cfg.core.spu_shared_runtime)
{
lock.lock();
}
// Try to find existing function
{
const auto found = m_spurt->m_map.find(func);
if (found != m_spurt->m_map.end() && found->second)
{
return found->second;
}
}
using namespace asmjit;
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@@ -78,8 +105,9 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
fmt::append(log, "========== SPU BLOCK 0x%05x (size %u) ==========\n\n", func[0], func.size() - 1);
}
asmjit::CodeHolder code;
code.init(m_rt->getCodeInfo());
CodeHolder code;
code.init(m_spurt->m_jitrt.getCodeInfo());
code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
X86Assembler compiler(&code);
this->c = &compiler;
@@ -626,7 +654,7 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
c->align(kAlignCode, 16);
c->bind(label_diff);
c->inc(SPU_OFF_64(block_failure));
c->jmp(asmjit::imm_ptr(&spu_recompiler_base::dispatch));
c->jmp(imm_ptr(&spu_recompiler_base::dispatch));
for (auto&& work : decltype(after)(std::move(after)))
{
@@ -648,15 +676,228 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
// Compile and get function address
spu_function_t fn;
if (m_rt->add(&fn, &code))
if (m_spurt->m_jitrt.add(&fn, &code))
{
LOG_FATAL(SPU, "Failed to build a function");
}
// Register function
m_spurt->m_map[func] = fn;
// Generate a dispatcher (übertrampoline)
std::vector<u32> addrv{func[0]};
const auto beg = m_spurt->m_map.lower_bound(addrv);
addrv[0] += 4;
const auto end = m_spurt->m_map.lower_bound(addrv);
const u32 size0 = std::distance(beg, end);
if (size0 == 1)
{
m_spurt->m_dispatcher[func[0] / 4] = fn;
}
else
{
CodeHolder code;
code.init(m_spurt->m_jitrt.getCodeInfo());
X86Assembler compiler(&code);
this->c = &compiler;
if (g_cfg.core.spu_debug)
{
// Set logger
code.setLogger(&logger);
}
compiler.comment("\n\nTrampoline:\n\n");
struct work
{
u32 size;
u32 level;
Label label;
std::map<std::vector<u32>, spu_function_t>::iterator beg;
std::map<std::vector<u32>, spu_function_t>::iterator end;
};
std::vector<work> workload;
workload.reserve(size0);
workload.emplace_back();
workload.back().size = size0;
workload.back().level = 1;
workload.back().beg = beg;
workload.back().end = end;
for (std::size_t i = 0; i < workload.size(); i++)
{
// Get copy of the workload info
work w = workload[i];
// Split range in two parts
auto it = w.beg;
auto it2 = w.beg;
u32 size1 = w.size / 2;
u32 size2 = w.size - size1;
std::advance(it2, w.size / 2);
while (true)
{
it = it2;
size1 = w.size - size2;
// Adjust ranges (forward)
while (it != w.end && w.beg->first.at(w.level) == it->first.at(w.level))
{
it++;
size1++;
}
if (it == w.end)
{
// Cannot split: words are identical within the range at this level
w.level++;
}
else
{
size2 = w.size - size1;
break;
}
}
// Value for comparison
const u32 x = it->first.at(w.level);
// Adjust ranges (backward)
while (true)
{
it--;
if (it->first.at(w.level) != x)
{
it++;
break;
}
verify(HERE), it != w.beg;
size1--;
size2++;
}
if (w.label.isValid())
{
c->align(kAlignCode, 16);
c->bind(w.label);
}
c->cmp(x86::dword_ptr(*ls, func[0] + (w.level - 1) * 4), x);
// Low subrange target label
Label label_below;
if (size1 == 1)
{
label_below = c->newLabel();
c->jb(label_below);
}
else
{
workload.push_back(w);
workload.back().end = it;
workload.back().size = size1;
workload.back().label = c->newLabel();
c->jb(workload.back().label);
}
// Second subrange target
const auto target = it->second ? it->second : &dispatch;
if (size2 == 1)
{
c->jmp(imm_ptr(target));
}
else
{
it2 = it;
// Select additional midrange for equality comparison
while (it2 != w.end && it2->first.at(w.level) == x)
{
size2--;
it2++;
}
if (it2 != w.end)
{
// High subrange target label
Label label_above;
if (size2 == 1)
{
label_above = c->newLabel();
c->ja(label_above);
}
else
{
workload.push_back(w);
workload.back().beg = it2;
workload.back().size = size2;
workload.back().label = c->newLabel();
c->ja(workload.back().label);
}
const u32 size3 = w.size - size1 - size2;
if (size3 == 1)
{
c->jmp(imm_ptr(target));
}
else
{
workload.push_back(w);
workload.back().beg = it;
workload.back().end = it2;
workload.back().size = size3;
workload.back().label = c->newLabel();
c->jmp(workload.back().label);
}
if (label_above.isValid())
{
c->bind(label_above);
c->jmp(imm_ptr(it2->second ? it2->second : &dispatch));
}
}
else
{
workload.push_back(w);
workload.back().beg = it;
workload.back().size = w.size - size1;
workload.back().label = c->newLabel();
c->jmp(workload.back().label);
}
}
if (label_below.isValid())
{
c->bind(label_below);
c->jmp(imm_ptr(w.beg->second ? w.beg->second : &dispatch));
}
}
spu_function_t tr;
if (m_spurt->m_jitrt.add(&tr, &code))
{
LOG_FATAL(SPU, "Failed to build a trampoline");
}
m_spurt->m_dispatcher[func[0] / 4] = tr;
}
if (g_cfg.core.spu_debug)
{
// Add ASMJIT logs
fmt::append(log, "{%s} Address: %p\n\n", m_spu.get_name(), fn);
fmt::append(log, "Address: %p (%p)\n\n", fn, +m_spurt->m_dispatcher[func[0] / 4]);
log += logger.getString();
log += "\n\n\n";
@@ -731,25 +972,24 @@ void spu_recompiler::branch_fixed(u32 target)
Label patch_point = c->newLabel();
c->lea(*qw0, x86::qword_ptr(patch_point));
c->mov(SPU_OFF_32(pc), target);
c->align(kAlignCode, 16);
// Need to emit exactly one executable instruction within 8 bytes
c->align(kAlignCode, 8);
c->bind(patch_point);
const auto result = m_spu.jit_map.emplace(block(m_spu, target), nullptr);
const auto result = m_spurt->m_map.emplace(block(m_spu, target), nullptr);
if (result.second || !result.first->second)
{
if (result.first->first.size())
{
// Target block hasn't been compiled yet, record overwriting position
c->mov(*ls, imm_ptr(&*result.first));
c->jmp(imm_ptr(&spu_recompiler_base::branch));
}
else
{
// SPURS Workload entry point or similar thing
c->mov(x86::r10, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher) + target * 2));
c->xor_(qw0->r32(), qw0->r32());
c->jmp(x86::r10);
// SPURS Workload entry point or similar thing (emit 8-byte NOP)
c->dq(0x841f0f);
}
}
else
@@ -757,7 +997,14 @@ void spu_recompiler::branch_fixed(u32 target)
c->jmp(imm_ptr(result.first->second));
}
c->align(kAlignCode, 16);
// Branch via dispatcher (occupies 16 bytes including padding)
c->align(kAlignCode, 8);
c->mov(x86::rax, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher) + target * 2));
c->xor_(qw0->r32(), qw0->r32());
c->jmp(x86::rax);
c->align(kAlignCode, 8);
c->dq(reinterpret_cast<u64>(&*result.first));
c->dq(reinterpret_cast<u64>(result.first->second));
}
void spu_recompiler::branch_indirect(spu_opcode_t op)
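Note on the code emitted by branch_fixed() above: every fixed branch site is laid out so that branch() and dispatch() can locate their data relative to the patch point, whose address is loaded into the third argument register by the lea instruction. A conceptual C++ view of that layout (not a struct in the source; the offsets follow from the align(8) directives and dq() calls above):

struct patch_site // illustrative only
{
	u8  patchable[8];   // One instruction: 'jmp spu_recompiler_base::branch' (5 bytes
	                    // plus padding), or the 8-byte NOP emitted as dq(0x841f0f)
	u8  dispatcher[16]; // mov rax, [cpu + jit_dispatcher + target * 2];
	                    // xor qw0d, qw0d; jmp rax; padded to 16 bytes
	u64 map_entry;      // &*result.first, read by branch() at rip + 24
	u64 compiled;       // result.first->second, stored atomically by branch() at rip + 32
};

Once the patchable slot has been overwritten with the NOP, execution falls straight through into the dispatcher jump, permanently routing the site through the per-address trampoline — the behavior described by the third line of the commit message.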

View file

@@ -32,13 +32,13 @@ public:
// SPU ASMJIT Recompiler
class spu_recompiler : public spu_recompiler_base
{
const std::shared_ptr<asmjit::JitRuntime> m_rt;
std::shared_ptr<spu_runtime> m_spurt;
public:
spu_recompiler(class SPUThread& spu);
virtual spu_function_t get(u32 lsa) override;
virtual spu_function_t compile(const std::vector<u32>& func) override;
private:

View file

@@ -16,9 +16,6 @@ spu_recompiler_base::spu_recompiler_base(SPUThread& spu)
{
// Initialize lookup table
spu.jit_dispatcher.fill(&dispatch);
// Initialize "empty" block
spu.jit_map[std::vector<u32>()] = &dispatch;
}
spu_recompiler_base::~spu_recompiler_base()
@@ -27,73 +24,89 @@ spu_recompiler_base::~spu_recompiler_base()
void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip)
{
const auto result = spu.jit_map.emplace(block(spu, spu.pc), nullptr);
if (result.second || !result.first->second)
// If the check failed after a direct branch, patch the site with a single NOP
if (rip)
{
result.first->second = spu.jit->compile(result.first->first);
#ifdef _MSC_VER
*(volatile u64*)(rip) = 0x841f0f;
#else
__atomic_store_n(reinterpret_cast<u64*>(rip), 0x841f0f, __ATOMIC_RELAXED);
#endif
}
spu.jit_dispatcher[spu.pc / 4] = result.first->second;
const auto func = spu.jit->get(spu.pc);
// First attempt (load new trampoline and retry)
if (func != spu.jit_dispatcher[spu.pc / 4])
{
spu.jit_dispatcher[spu.pc / 4] = func;
return;
}
// Second attempt (recover from recursion caused by repeated unsuccessful trampoline calls)
if (spu.block_counter != spu.block_recover && func != &dispatch)
{
spu.block_recover = spu.block_counter;
return;
}
// Compile
verify(HERE), spu.jit->compile(block(spu, spu.pc));
spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc);
}
void spu_recompiler_base::branch(SPUThread& spu, std::pair<const std::vector<u32>, spu_function_t>* pair, u8* rip)
void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip)
{
const auto pair = *reinterpret_cast<std::pair<const std::vector<u32>, spu_function_t>**>(rip + 24);
spu.pc = pair->first[0];
if (!pair->second)
{
pair->second = spu.jit->compile(pair->first);
}
const auto func = pair->second ? pair->second : spu.jit->compile(pair->first);
spu.jit_dispatcher[spu.pc / 4] = pair->second;
verify(HERE), func, pair->second == func;
// Overwrite function address
reinterpret_cast<atomic_t<spu_function_t>*>(rip + 32)->store(func);
// Overwrite jump to this function with jump to the compiled function
const s64 rel = reinterpret_cast<u64>(pair->second) - reinterpret_cast<u64>(rip) - 5;
const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
alignas(8) u8 bytes[8];
if (rel >= INT32_MIN && rel <= INT32_MAX)
{
const s64 rel8 = (rel + 5) - 2;
alignas(8) u8 bytes[8];
if (rel8 >= INT8_MIN && rel8 <= INT8_MAX)
{
bytes[0] = 0xeb; // jmp rel8
bytes[1] = static_cast<s8>(rel8);
std::memset(bytes + 2, 0x90, 5);
bytes[7] = 0x48;
std::memset(bytes + 2, 0x90, 6);
}
else
{
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
std::memset(bytes + 5, 0x90, 2);
bytes[7] = 0x48;
std::memset(bytes + 5, 0x90, 3);
}
#ifdef _MSC_VER
*(volatile u64*)(rip) = *reinterpret_cast<u64*>(+bytes);
#else
__atomic_store_n(reinterpret_cast<u64*>(rip), *reinterpret_cast<u64*>(+bytes), __ATOMIC_RELAXED);
#endif
}
else
{
alignas(16) u8 bytes[16];
bytes[0] = 0xff; // jmp [rip+2]
bytes[0] = 0xff; // jmp [rip+26]
bytes[1] = 0x25;
bytes[2] = 0x02;
bytes[2] = 0x1a;
bytes[3] = 0x00;
bytes[4] = 0x00;
bytes[5] = 0x00;
bytes[6] = 0x48; // mov rax, imm64 (not executed)
bytes[7] = 0xb8;
std::memcpy(bytes + 8, &pair->second, 8);
reinterpret_cast<atomic_t<u128>*>(rip)->store(*reinterpret_cast<u128*>(+bytes));
bytes[6] = 0x90;
bytes[7] = 0x90;
}
#ifdef _MSC_VER
*(volatile u64*)(rip) = *reinterpret_cast<u64*>(+bytes);
#else
__atomic_store_n(reinterpret_cast<u64*>(rip), *reinterpret_cast<u64*>(+bytes), __ATOMIC_RELAXED);
#endif
}
std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa)
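Both patching paths above rely on the rewritten region being 8 bytes long, 8-byte aligned, and containing exactly one executable instruction, so that a single atomic store can flip it while other SPU threads may be executing through it. The pattern, used twice in this file, could be factored into a helper roughly like this (a sketch; patch_u64 is not a function in the source):

// Atomically replace 8 code bytes at an 8-byte-aligned address
static void patch_u64(u8* rip, u64 value)
{
#ifdef _MSC_VER
	// On x86-64, an aligned volatile store compiles to a single mov,
	// which concurrent readers observe atomically
	*(volatile u64*)(rip) = value;
#else
	__atomic_store_n(reinterpret_cast<u64*>(rip), value, __ATOMIC_RELAXED);
#endif
}

This constraint is also why the jmp rel8/rel32 encodings are padded with 0x90 up to exactly 8 bytes before being written, and why the far-target case now keeps the destination pointer out of line at rip + 32 instead of splicing in the previous 16-byte mov rax, imm64 sequence.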

View file

@ -15,6 +15,9 @@ public:
virtual ~spu_recompiler_base();
// Get pointer to the trampoline at given position
virtual spu_function_t get(u32 lsa) = 0;
// Compile function
virtual spu_function_t compile(const std::vector<u32>& func) = 0;
@@ -22,7 +25,7 @@ public:
static void dispatch(SPUThread&, void*, u8*);
// Direct branch fallback for non-compiled destination
static void branch(SPUThread&, std::pair<const std::vector<u32>, spu_function_t>*, u8* rip);
static void branch(SPUThread&, void*, u8*);
// Get the block at specified address
static std::vector<u32> block(SPUThread&, u32 lsa);
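All three entry points declared here (compiled blocks, dispatch() and branch()) share the spu_function_t signature, so generated code can tail-jump between them without shuffling registers: the second parameter carries the local storage pointer and the third doubles as the patch-site address (see the lea of the patch point in branch_fixed()). Judging by its usage in this commit, the type is effectively:

// Assumed from usage in this commit, not quoted from the header
using spu_function_t = void(*)(SPUThread& spu, void* ls, u8* rip);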

View file

@@ -594,8 +594,6 @@ public:
std::unique_ptr<class spu_recompiler_base> jit; // Recompiler instance
std::map<std::vector<u32>, spu_function_t> jit_map; // All compiled blocks (first u32 is addr)
u64 block_counter = 0;
u64 block_recover = 0;
u64 block_failure = 0;

View file

@@ -841,6 +841,12 @@ void Emulator::Load(bool add_only)
LOG_NOTICE(LOADER, "Elf path: %s", argv[0]);
}
if (g_cfg.core.spu_debug)
{
fs::file log(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite);
log.write(fmt::format("SPU JIT Log\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle(), Emu.GetTitleID()));
}
ppu_load_exec(ppu_exec);
fxm::import<GSRender>(Emu.GetCallbacks().get_gs_render); // TODO: must be created in appropriate sys_rsx syscall

View file

@@ -302,6 +302,7 @@ struct cfg_root : cfg::node
cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; // Number of hardware threads dedicated to heavy simultaneous SPU tasks
cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; // Number of milliseconds to block a thread if a virtual 'core' isn't free
cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; // Try to detect wait loops and trigger thread yield
cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
cfg::_enum<lib_loading_type> lib_loading{this, "Lib Loader", lib_loading_type::liblv2only};
cfg::_bool hook_functions{this, "Hook static functions"};
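With the default of true, every SPU thread obtains the same spu_runtime through fxm::get_always, so compiled functions are shared and deduplicated; setting it to false restores a private runtime per recompiler instance, trading memory for the absence of lock contention on m_mutex. Assuming the usual YAML serialization of cfg nodes, the opt-out would look like this in the user configuration:

Core:
  SPU Shared Runtime: false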