Implement SPU recompiler cache

Shared between ASMJIT/LLVM recompilers, compiled at startup
2025-04-20 11:36:13 +00:00 · 2018-05-05 00:01:27 +03:00 · 2018-05-05 00:01:27 +03:00 · fe4c3c4d84
commit fe4c3c4d84
parent f5ee6fb113
7 changed files with 282 additions and 59 deletions
--- a/rpcs3/Emu/Cell/PPUAnalyser.h
+++ b/rpcs3/Emu/Cell/PPUAnalyser.h
@ -71,6 +71,7 @@ struct ppu_module
 	uchar sha1[20];
 	std::string name;
 	std::string path;
+	std::string cache;
 	std::vector<ppu_reloc> relocs;
 	std::vector<ppu_segment> segs;
 	std::vector<ppu_segment> secs;
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -9,6 +9,7 @@
 #include "PPUInterpreter.h"
 #include "PPUAnalyser.h"
 #include "PPUModule.h"
+#include "SPURecompiler.h"
 #include "lv2/sys_sync.h"
 #include "lv2/sys_prx.h"
 #include "Utilities/GDBDebugServer.h"
@ -1086,6 +1087,22 @@ extern void ppu_initialize()
 		return;
 	}

+	// New PPU cache location
+	_main->cache = fmt::format("%sdata/%s/ppu-%s-%s/", fs::get_config_dir(), Emu.GetTitleID(), fmt::base57(_main->sha1), Emu.GetBoot().substr(Emu.GetBoot().find_last_of('/') + 1));
+
+	if (!fs::create_path(_main->cache))
+	{
+		fmt::throw_exception("Failed to create cache directory: %s (%s)", _main->cache, fs::g_tls_error);
+	}
+
+	// Initialize SPU cache
+	spu_cache::initialize();
+
+	if (Emu.IsStopped())
+	{
+		return;
+	}
+
 	// Initialize main module
 	ppu_initialize(*_main);

--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -52,25 +52,27 @@ spu_recompiler::spu_recompiler()
 	}
 }

-spu_function_t spu_recompiler::get(u32 lsa)
+void spu_recompiler::init() // Lazily binds this recompiler instance to the cache and runtime objects held by fxm
 {
 	// Initialize only once: a non-null m_spurt means the handles below were already fetched
 	if (!m_spurt)
 	{
+		m_cache = fxm::get<spu_cache>(); // compile() null-checks m_cache before calling add()
 		m_spurt = fxm::get_always<spu_runtime>();
 	}
+}
+
+spu_function_t spu_recompiler::get(u32 lsa) // Returns the dispatcher table entry for address lsa
+{
+	init(); // make sure m_spurt is set before it is dereferenced below

 	// Simple atomic read; the table is indexed per 4 bytes of address (lsa / 4)
 	return m_spurt->m_dispatcher[lsa / 4];
 }

-spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
+spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 {
-	// Initialize if necessary
-	if (!m_spurt)
-	{
-		m_spurt = fxm::get_always<spu_runtime>();
-	}
+	init();

 	// Don't lock without shared runtime
 	std::unique_lock<shared_mutex> lock(m_spurt->m_mutex, std::defer_lock);
@ -80,16 +82,18 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
 		lock.lock();
 	}

-	// Try to find existing function
-	{
-		const auto found = m_spurt->m_map.find(func);
+	// Try to find existing function, register new one if necessary
+	const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);

-		if (found != m_spurt->m_map.end() && found->second)
-		{
-			return found->second;
-		}
+	auto& fn_location = fn_info.first->second;
+
+	if (fn_location)
+	{
+		return fn_location;
 	}

+	auto& func = fn_info.first->first;
+
 	using namespace asmjit;

 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@ -811,7 +815,7 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
 	}

 	// Register function
-	m_spurt->m_map[func] = fn;
+	fn_location = fn;

 	// Generate a dispatcher (übertrampoline)
 	std::vector<u32> addrv{func[0]};
@ -1043,6 +1047,11 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
 		fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::write + fs::append).write(log);
 	}

+	if (m_cache)
+	{
+		m_cache->add(func);
+	}
+
 	return fn;
 }

--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -37,9 +37,11 @@ class spu_recompiler : public spu_recompiler_base
 public:
 	spu_recompiler();

+	virtual void init() override;
+
 	virtual spu_function_t get(u32 lsa) override;

-	virtual spu_function_t compile(const std::vector<u32>& func) override;
+	virtual spu_function_t compile(std::vector<u32>&&) override;

 private:
 	// emitter:
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -1,14 +1,16 @@
-#include "stdafx.h"
+#include "stdafx.h"
 #include "Emu/System.h"
 #include "Emu/IdManager.h"
 #include "Emu/Memory/Memory.h"
 #include "Crypto/sha1.h"
+#include "Utilities/StrUtil.h"

 #include "SPUThread.h"
 #include "SPUAnalyser.h"
 #include "SPUInterpreter.h"
 #include "SPUDisAsm.h"
 #include "SPURecompiler.h"
+#include "PPUAnalyser.h"
 #include <algorithm>
 #include <mutex>
 #include <thread>
@ -17,6 +19,166 @@ extern u64 get_system_time();

 const spu_decoder<spu_itype> s_spu_itype;

+spu_cache::spu_cache(const std::string& loc) // Opens the cache file located at loc
+	: m_file(loc, fs::read + fs::write + fs::create) // opened with the read, write and create flags combined
+{
+}

+spu_cache::~spu_cache() // Empty body: no explicit teardown is performed here
+{
+}
+
+// Load every function record stored in the cache file; returns an empty list when the file is not open
+std::vector<std::vector<u32>> spu_cache::get()
+{
+	std::vector<std::vector<u32>> result;

+	if (!m_file)
+	{
+		return result;
+	}

+	m_file.seek(0);

+	// Record layout: word count, entry address, raw words. TODO: signal truncated or otherwise broken file
+	while (true)
+	{
+		be_t<u32> size;
+		be_t<u32> addr;

+		// Also stop on records that cannot lie inside the 0x10000-word LS (keeps size + 1 from wrapping to 0)
+		if (!m_file.read(size) || !m_file.read(addr) || addr >= 0x40000 || size > 0x10000 - addr / 4)
+		{
+			break;
+		}

+		std::vector<u32> func(size + 1);
+		func[0] = addr;
+		if (m_file.read(func.data() + 1, func.size() * 4 - 4) != func.size() * 4 - 4)
+		{
+			break;
+		}

+		result.emplace_back(std::move(func));
+	}

+	return result;
+}
+
+void spu_cache::add(const std::vector<u32>& func) // Writes one function record: word count, entry address, then the words
+{
+	if (!m_file) // no usable cache file: do nothing
+	{
+		return;
+	}

+	be_t<u32> size = ::size32(func) - 1; // number of words after func[0]; NOTE(review): an empty func would wrap here and read func[0] below
+	be_t<u32> addr = func[0]; // func[0] holds the function's address rather than an instruction word
+	m_file.write(size); // same two header fields, in the same order, that get() reads back
+	m_file.write(addr);
+	m_file.write(func.data() + 1, func.size() * 4 - 4); // payload: everything after func[0], 4 bytes per word
+}
+
+void spu_cache::initialize() // Opens the SPU cache under the main module's cache directory, precompiles its contents, then registers it
+{
+	const auto _main = fxm::get<ppu_module>();

+	if (!_main || !g_cfg.core.spu_shared_runtime) // the cache is only set up when a main module exists and the shared runtime is enabled
+	{
+		return;
+	}

+	// SPU cache file: the name encodes the block size setting and a format version ("v0")
+	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat";

+	auto cache = std::make_shared<spu_cache>(loc);

+	if (!*cache) // the file could not be opened: log it and continue without a cache
+	{
+		LOG_ERROR(SPU, "Failed to initialize SPU cache at: %s", loc);
+		return;
+	}

+	// Read every function stored by previous runs
+	auto func_list = cache->get();

+	// Recompiler instance used only for warming up the runtime; stays null for other decoder settings
+	std::unique_ptr<spu_recompiler_base> compiler;

+	if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
+	{
+		compiler = spu_recompiler_base::make_asmjit_recompiler();
+	}

+	if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+	{
+		compiler = spu_recompiler_base::make_llvm_recompiler();
+	}

+	if (compiler)
+	{
+		compiler->init(); // runs before the cache is registered below — presumably so compile() does not re-add these functions; confirm
+	}

+	if (compiler && !func_list.empty())
+	{
+		// Fake LS: 0x10000 words, only populated with the function currently being rebuilt
+		std::vector<be_t<u32>> ls(0x10000);

+		// Timestamp of the last progress report
+		u64 timex = get_system_time();

+		// Rebuild each cached function
+		for (auto&& func : func_list)
+		{
+			// Copy the function's words into the fake LS starting at its address (func[0])
+			for (u32 i = 1, pos = func[0]; i < func.size(); i++, pos += 4)
+			{
+				ls[pos / 4] = se_storage<u32>::swap(func[i]); // NOTE(review): pos comes from cached data; nothing here bounds pos / 4 to ls
+			}

+			// Run the analyser over the fake LS; its result is only used for the check below
+			std::vector<u32> func2 = compiler->block(ls.data(), func[0]);

+			compiler->compile(std::move(func)); // compiles the cached words; func is moved-from afterwards

+			// Verify the analyser output against the fake LS and zero the words it covered
+			for (u32 i = 1, pos = func2[0]; i < func2.size(); i++, pos += 4)
+			{
+				if (se_storage<u32>::swap(func2[i]) != ls[pos / 4])
+				{
+					LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos);
+				}

+				ls[pos / 4] = 0; // NOTE(review): only func2's range is cleared; confirm it always covers what func wrote
+			}

+			if (Emu.IsStopped()) // abort requested: note that this returns without registering the cache
+			{
+				LOG_ERROR(SPU, "SPU Runtime: Cache building aborted.");
+				return;
+			}

+			// Print progress every 400 ms (400000 in get_system_time() units)
+			const u64 timed = get_system_time() - timex;

+			if (timed >= 400000)
+			{
+				LOG_SUCCESS(SPU, "Building SPU cache (%u/%u)...", &func - func_list.data(), func_list.size()); // index derived from the element's address
+				timex += 400000;
+			}
+		}

+		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
+	}

+	// Register the cache instance with fxm, moving the shared_ptr built above
+	fxm::import<spu_cache>([&]() -> std::shared_ptr<spu_cache>&&
+	{
+		return std::move(cache);
+	});
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@ -54,14 +216,14 @@ void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip)
 	}

 	// Compile
-	verify(HERE), spu.jit->compile(block(spu, spu.pc, &spu.jit->m_block_info));
+	verify(HERE), spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc));
 	spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc);
 }

 void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip)
 {
 	// Compile
-	const auto func = verify(HERE, spu.jit->compile(block(spu, spu.pc, &spu.jit->m_block_info)));
+	const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc)));
 	spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc);

 	// Overwrite jump to this function with jump to the compiled function
@ -102,23 +264,16 @@ void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip)
 #endif
 }

-std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa, std::bitset<0x10000>* out_info)
+std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 {
-	// Block info (local)
-	std::bitset<0x10000> block_info{};
-
-	// Select one to use
-	std::bitset<0x10000>& blocks = out_info ? *out_info : block_info;
-
-	if (out_info)
-	{
-		out_info->reset();
-	}
-
 	// Result: addr + raw instruction data
 	std::vector<u32> result;
 	result.reserve(256);
 	result.push_back(lsa);
+
+	// Initialize block entries
+	std::bitset<0x10000>& blocks = m_block_info;
+	blocks.reset();
 	blocks.set(lsa / 4);

 	// Simple block entry workload list
@ -140,13 +295,6 @@ std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa, std::bitset
 	// Associated constant values for 32-bit preferred slot
 	std::array<u32, 128> values;

-	if (spu.pc == lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
-	{
-		// TODO: use current register values for speculations
-		vflags[0] = +vf::is_const;
-		values[0] = spu.gpr[0]._u32[3];
-	}
-
 	for (u32 wi = 0; wi < wl.size();)
 	{
 		const auto next_block = [&]
@ -172,7 +320,7 @@ std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa, std::bitset
 		};

 		const u32 pos = wl[wi];
-		const u32 data = spu._ref<u32>(pos);
+		const u32 data = ls[pos / 4];
 		const auto op = spu_opcode_t{data};

 		wl[wi] += 4;
@ -272,7 +420,7 @@ std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa, std::bitset

 				for (u32 i = start; i < limit; i += 4)
 				{
-					const u32 target = spu._ref<u32>(i);
+					const u32 target = ls[i / 4];

 					if (target == 0 || target % 4)
 					{
@ -542,7 +690,7 @@ std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa, std::bitset
 			if (result[i] == 0)
 			{
 				const u32 pos = lsa + (i - 1) * 4;
-				const u32 data = spu._ref<u32>(pos);
+				const u32 data = ls[pos / 4];
 				const auto type = s_spu_itype.decode(data);

 				// Allow only NOP or LNOP instructions in holes
@ -597,13 +745,14 @@ class spu_llvm_runtime
 	// JIT instance (TODO: use small code model)
 	jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu), true};

+	// Debug module output location
+	std::string m_cache_path;
+
 	friend class spu_llvm_recompiler;

 public:
 	spu_llvm_runtime()
 	{
-		LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
-
 		// Initialize lookup table
 		for (auto& v : m_dispatcher)
 		{
@ -612,6 +761,13 @@ public:

 		// Initialize "empty" block
 		m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
+
+		// Clear LLVM output
+		m_cache_path = fxm::check_unlocked<ppu_module>()->cache + "llvm/";
+		fs::create_dir(m_cache_path);
+		fs::remove_all(m_cache_path, false);
+
+		LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized...");
 	}
 };

@ -791,27 +947,28 @@ public:
 		}
 	}

-	virtual spu_function_t get(u32 lsa) override
+	virtual void init() override // Lazily binds this instance to the cache, the LLVM runtime and its context
 	{
 		// Initialize only once: a non-null m_spurt means the handles below were already fetched
 		if (!m_spurt)
 		{
+			m_cache = fxm::get<spu_cache>(); // compile() null-checks m_cache before calling add()
 			m_spurt = fxm::get_always<spu_llvm_runtime>();
 			m_context = m_spurt->m_jit.get_context();
 		}
+	}
+
+	virtual spu_function_t get(u32 lsa) override // Returns the dispatcher table entry for address lsa
+	{
+		init(); // make sure m_spurt is set before it is dereferenced below

 		// Simple atomic read; the table is indexed per 4 bytes of address (lsa / 4)
 		return m_spurt->m_dispatcher[lsa / 4];
 	}

-	virtual spu_function_t compile(const std::vector<u32>& func) override
+	virtual spu_function_t compile(std::vector<u32>&& func_rv) override
 	{
-		// Initialize if necessary
-		if (!m_spurt)
-		{
-			m_spurt = fxm::get_always<spu_llvm_runtime>();
-			m_context = m_spurt->m_jit.get_context();
-		}
+		init();

 		// Don't lock without shared runtime
 		std::unique_lock<shared_mutex> lock(m_spurt->m_mutex, std::defer_lock);
@ -821,14 +978,18 @@ public:
 			lock.lock();
 		}

-		// Try to find existing function, register new
-		auto& fn_location = m_spurt->m_map[func];
+		// Try to find existing function, register new one if necessary
+		const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
+
+		auto& fn_location = fn_info.first->second;

 		if (fn_location)
 		{
 			return fn_location;
 		}

+		auto& func = fn_info.first->first;
+
 		std::string hash;
 		{
 			sha1_context ctx;
@ -855,7 +1016,7 @@ public:
 		}

 		// Create LLVM module
-		std::unique_ptr<Module> module = std::make_unique<Module>(hash, m_context);
+		std::unique_ptr<Module> module = std::make_unique<Module>(hash + ".obj", m_context);

 		// Initialize target
 		module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
@ -1244,7 +1405,7 @@ public:
 		if (g_cfg.core.spu_debug)
 		{
 			// Testing only
-			m_spurt->m_jit.add(std::move(module), fmt::format("%sSPU/%s.obj", Emu.GetCachePath(), hash));
+			m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path);
 		}
 		else
 		{
@ -1277,6 +1438,11 @@ public:
 			fs::file(Emu.GetCachePath() + "SPU.log", fs::write + fs::append).write(log);
 		}

+		if (m_cache)
+		{
+			m_cache->add(func);
+		}
+
 		return fn;
 	}

--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -1,7 +1,32 @@
 #pragma once

+#include "Utilities/File.h"
 #include "SPUThread.h"
+#include <vector>
 #include <bitset>
+#include <memory>
+
+// Helper class: file-backed list of SPU functions (see get()/add()) that initialize() recompiles at startup
+class spu_cache
+{
+	fs::file m_file; // backing cache file

+public:
+	spu_cache(const std::string& loc); // loc: path of the cache file to open

+	~spu_cache();

+	operator bool() const // reports whether the backing file is usable; NOTE(review): consider making this explicit
+	{
+		return m_file.operator bool();
+	}

+	std::vector<std::vector<u32>> get(); // every stored function, each as {address, instruction words...}

+	void add(const std::vector<u32>& func); // stores one function; func[0] is its address

+	static void initialize(); // opens the cache, precompiles its contents and registers the instance
+};

 // SPU Recompiler instance base class
 class spu_recompiler_base
@ -12,16 +37,21 @@ protected:

 	std::bitset<0x10000> m_block_info;

+	std::shared_ptr<spu_cache> m_cache;
+
 public:
 	spu_recompiler_base();

 	virtual ~spu_recompiler_base();

+	// Initialize
+	virtual void init() = 0;
+
 	// Get pointer to the trampoline at given position
 	virtual spu_function_t get(u32 lsa) = 0;

 	// Compile function
-	virtual spu_function_t compile(const std::vector<u32>& func) = 0;
+	virtual spu_function_t compile(std::vector<u32>&&) = 0;

 	// Default dispatch function fallback (second arg is unused)
 	static void dispatch(SPUThread&, void*, u8* rip);
@ -30,7 +60,7 @@ public:
 	static void branch(SPUThread&, void*, u8* rip);

 	// Get the block at specified address
-	static std::vector<u32> block(SPUThread&, u32 lsa, std::bitset<0x10000>* = nullptr);
+	std::vector<u32> block(const be_t<u32>* ls, u32 lsa);

 	// Create recompiler instance (ASMJIT)
 	static std::unique_ptr<spu_recompiler_base> make_asmjit_recompiler();
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@ -871,8 +871,6 @@ void Emulator::Load(bool add_only)
 				}

 				log.write(fmt::format("SPU JIT Log\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle(), Emu.GetTitleID()));
-				fs::create_dir(Emu.GetCachePath() + "SPU");
-				fs::remove_all(Emu.GetCachePath() + "SPU", false);
 			}

 			ppu_load_exec(ppu_exec);