From 12eee6a19eb0b04c9e64d531bd7bb36c20d96732 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Mon, 4 Jun 2018 00:20:14 +0300
Subject: [PATCH] SPU ASMJIT: Implement Mega block mode (experimental)

Disable extra modes for SPU LLVM for now.
In Mega mode, SPU Analyser tries to determine complete functions.
Recompiler tries to speed up returns via 'stack mirror'.
---
 rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 84 ++++++++++++++++++++++++--
 rpcs3/Emu/Cell/SPUASMJITRecompiler.h   |  3 +-
 rpcs3/Emu/Cell/SPURecompiler.cpp       | 21 +++++--
 rpcs3/Emu/Cell/SPUThread.cpp           | 13 +++-
 rpcs3/Emu/Cell/SPUThread.h             |  2 +
 5 files changed, 109 insertions(+), 14 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index d358085a54..d41d7992d3 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -730,6 +730,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 
 		if (found != instr_labels.end())
 		{
+			if (m_preds.count(pos))
+			{
+				c->align(kAlignCode, 16);
+			}
+
 			c->bind(found->second);
 		}
 
@@ -1118,11 +1123,22 @@ static void check_state_ret(SPUThread& _spu, void*, u8*)
 
 static void check_state(SPUThread* _spu, spu_function_t _ret)
 {
-	if (_spu->check_state())
+	if (test(_spu->state) && _spu->check_state())
 	{
 		_ret = &check_state_ret;
 	}
 
+	if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
+	{
+		// Get stack pointer, try to use native return address (check SPU return address)
+		const auto x = _spu->stack_mirror[(_spu->gpr[1]._u32[3] & 0x3fff0) >> 4];
+
+		if (x._u32[2] == _spu->pc)
+		{
+			_ret = reinterpret_cast<spu_function_t>(x._u64[0]);
+		}
+	}
+
 	_ret(*_spu, _spu->_ptr<u8>(0), nullptr);
 }
 
@@ -1172,11 +1188,11 @@ void spu_recompiler::branch_fixed(u32 target)
 	c->jmp(x86::rax);
 }
 
-void spu_recompiler::branch_indirect(spu_opcode_t op, bool local)
+void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
 {
 	using namespace asmjit;
 
-	if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !local)
+	if (g_cfg.core.spu_block_size != spu_block_size_type::giga && !jt)
 	{
 		// Simply external call (return or indirect call)
 		c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
@@ -1238,12 +1254,59 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
 	c->mov(SPU_OFF_32(pc), *addr);
 	c->cmp(SPU_OFF_32(state), 0);
 	c->jnz(label_check);
+
+	if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret)
+	{
+		// Get stack pointer, try to use native return address (check SPU return address)
+		c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3));
+		c->and_(qw1->r32(), 0x3fff0);
+		c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror)));
+		c->cmp(x86::dword_ptr(*qw1, 8), *addr);
+		c->cmove(x86::r10, x86::qword_ptr(*qw1));
+	}
+
 	c->jmp(x86::r10);
 	c->bind(label_check);
 	c->mov(*ls, x86::r10);
 	c->jmp(imm_ptr(&check_state));
 }
 
+void spu_recompiler::branch_set_link(u32 target)
+{
+	using namespace asmjit;
+
+	if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
+	{
+		// Find instruction at target
+		const auto local = instr_labels.find(target);
+
+		if (local != instr_labels.end() && local->second.isValid())
+		{
+			Label ret = c->newLabel();
+
+			// Get stack pointer, write native and SPU return addresses into the stack mirror
+			c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3));
+			c->and_(qw1->r32(), 0x3fff0);
+			c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror)));
+			c->lea(x86::r10, x86::qword_ptr(ret));
+			c->mov(x86::qword_ptr(*qw1, 0), x86::r10);
+			c->mov(x86::qword_ptr(*qw1, 8), target);
+
+			after.emplace_back([=, target = local->second]
+			{
+				// Clear return info after use
+				c->align(kAlignCode, 16);
+				c->bind(ret);
+				c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3));
+				c->and_(qw1->r32(), 0x3fff0);
+				c->pcmpeqd(x86::xmm0, x86::xmm0);
+				c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror)), x86::xmm0);
+				c->jmp(target);
+			});
+		}
+	}
+}
+
 void spu_recompiler::fall(spu_opcode_t op)
 {
 	auto gate = [](SPUThread* _spu, u32 opcode, spu_inter_func_t _func, spu_function_t _ret)
@@ -2768,9 +2831,17 @@ void spu_recompiler::STQX(spu_opcode_t op)
 
 void spu_recompiler::BI(spu_opcode_t op)
 {
+	const auto found = m_targets.find(m_pos);
+	const auto is_jt = found == m_targets.end() || found->second.size() != 1 || found->second.front() != -1;
+
+	if (found == m_targets.end() || found->second.empty())
+	{
+		LOG_ERROR(SPU, "[0x%x] BI: no targets", m_pos);
+	}
+
 	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
 	c->and_(*addr, 0x3fffc);
-	branch_indirect(op, m_targets.find(m_pos) != m_targets.end());
+	branch_indirect(op, is_jt, !is_jt);
 	m_pos = -1;
 }
 
@@ -2781,7 +2852,8 @@ void spu_recompiler::BISL(spu_opcode_t op)
 	const XmmLink& vr = XmmAlloc();
 	c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0)));
 	c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
-	branch_indirect(op, m_targets.find(m_pos) != m_targets.end());
+	branch_set_link(m_pos + 4);
+	branch_indirect(op, true, false);
 	m_pos = -1;
 }
 
@@ -4282,6 +4354,7 @@ void spu_recompiler::BRASL(spu_opcode_t op)
 
 	if (target != m_pos + 4)
 	{
+		branch_set_link(m_pos + 4);
 		branch_fixed(target);
 		m_pos = -1;
 	}
@@ -4319,6 +4392,7 @@ void spu_recompiler::BRSL(spu_opcode_t op)
 
 	if (target != m_pos + 4)
 	{
+		branch_set_link(m_pos + 4);
 		branch_fixed(target);
 		m_pos = -1;
 	}
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
index a2c77a5e75..6388cb157c 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@@ -103,7 +103,8 @@ private:
 	asmjit::X86Mem XmmConst(__m128i data);
 
 	void branch_fixed(u32 target);
-	void branch_indirect(spu_opcode_t op, bool local = false);
+	void branch_indirect(spu_opcode_t op, bool jt = false, bool ret = true);
+	void branch_set_link(u32 target);
 	void fall(spu_opcode_t op);
 	void save_rcx();
 	void load_rcx();
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 47968c3a7f..8768a1fd65 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -90,6 +90,12 @@ void spu_cache::initialize()
 		return;
 	}
 
+	if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
+	{
+		// Force Safe mode
+		g_cfg.core.spu_block_size.from_default();
+	}
+
 	// SPU cache file (version + block size type)
 	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v3.dat";
 
@@ -384,7 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			continue;
 		}
 
-		if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
+		if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
 		{
 			// Stop on special instructions (TODO)
 			m_targets[pos].push_back(-1);
@@ -437,8 +443,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			add_block(target);
 		}
 
-		if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
+		if (type == spu_itype::BISL && g_cfg.core.spu_block_size != spu_block_size_type::safe)
 		{
+			m_targets[pos].push_back(pos + 4);
 			add_block(pos + 4);
 		}
 	}
@@ -548,7 +555,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 		if (type == spu_itype::BI || type == spu_itype::BISL)
 		{
-			if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
+			if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe)
 			{
 				if (m_targets[pos].empty())
 				{
@@ -557,6 +564,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			}
 			else
 			{
+				m_targets[pos].push_back(pos + 4);
 				add_block(pos + 4);
 			}
 		}
@@ -587,8 +595,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 		m_targets[pos].push_back(target);
 
-		if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
+		if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
 		{
+			m_targets[pos].push_back(pos + 4);
 			add_block(pos + 4);
 		}
 
@@ -803,11 +812,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		}
 	}
 
-	while (g_cfg.core.spu_block_size == spu_block_size_type::safe)
+	while (g_cfg.core.spu_block_size != spu_block_size_type::giga)
 	{
 		const u32 initial_size = result.size();
 
-		// Check unreachable blocks in safe mode (TODO)
+		// Check unreachable blocks in safe and mega modes (TODO)
 		u32 limit = lsa + result.size() * 4 - 4;
 
 		for (auto& pair : m_preds)
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 069af53ca3..514ca91f4d 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -637,8 +637,17 @@ SPUThread::SPUThread(const std::string& name, u32 index, lv2_spu_group* group)
 		jit = spu_recompiler_base::make_llvm_recompiler();
 	}
 
-	// Initialize lookup table
-	jit_dispatcher.fill(&spu_recompiler_base::dispatch);
+	if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise)
+	{
+		// Initialize lookup table
+		jit_dispatcher.fill(&spu_recompiler_base::dispatch);
+
+		if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
+		{
+			// Initialize stack mirror
+			std::memset(stack_mirror.data(), 0xff, sizeof(stack_mirror));
+		}
+	}
 }
 
 void SPUThread::push_snr(u32 number, u32 value)
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 061adee884..baee513c58 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -588,6 +588,8 @@ public:
 
 	std::array<spu_function_t, 0x10000> jit_dispatcher; // Dispatch table for indirect calls
 
+	std::array<v128, 0x4000> stack_mirror; // Return address information
+
 	void push_snr(u32 number, u32 value);
 	void do_dma_transfer(const spu_mfc_cmd& args);
 	bool do_dma_check(const spu_mfc_cmd& args);
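Note: for readers unfamiliar with the trick, here is a minimal C++ sketch of the 'stack mirror' fast-return idea the patch implements. It is a conceptual model only, not RPCS3 code and not the x86 the recompiler emits; every identifier in it (mirror_entry, record_link, resolve_return, spu_sp, slot) is invented for illustration, and only the 0x3fff0 mask and the 16-byte slot granularity are taken from the patch above.

// Illustrative sketch only (not RPCS3 code): models how the stack mirror pairs a
// native return address with the SPU return address, keyed by the SPU stack pointer.
#include <array>
#include <cstdint>
#include <cstdio>

struct mirror_entry
{
	std::uint64_t native_ret = ~0ull; // host-side continuation (here just an id)
	std::uint32_t spu_ret    = ~0u;   // expected SPU return address
};

// One entry per 16-byte stack slot, like stack_mirror in the patch (0x4000 slots).
static std::array<mirror_entry, 0x4000> mirror;

static std::uint32_t spu_sp = 0x3ffe0; // stand-in for gpr[1]._u32[3]

static std::size_t slot()
{
	return (spu_sp & 0x3fff0) >> 4;
}

// Call side (BISL/BRSL/BRASL): remember the continuation on both "CPUs".
static void record_link(std::uint64_t native_ret, std::uint32_t spu_ret)
{
	mirror[slot()] = { native_ret, spu_ret };
}

// Return side (BI $LR): if the SPU target matches what this frame recorded,
// reuse the native continuation; otherwise fall back to the dispatcher,
// which is all the Safe mode ever does.
static std::uint64_t resolve_return(std::uint32_t spu_target)
{
	const mirror_entry e = mirror[slot()];
	return e.spu_ret == spu_target ? e.native_ret : 0; // 0 = use jit_dispatcher
}

int main()
{
	record_link(/*native*/ 42, /*spu*/ 0x1230);

	std::printf("match    -> %llu\n", static_cast<unsigned long long>(resolve_return(0x1230))); // 42
	std::printf("mismatch -> %llu\n", static_cast<unsigned long long>(resolve_return(0x0400))); // 0
}

Keying the mirror by the SPU stack pointer means a stale or mismatched entry (for example, the guest rewrote $LR or unwound several frames at once) merely falls back to the ordinary jit_dispatcher lookup instead of jumping somewhere wrong, which is also why SPUThread fills stack_mirror with 0xff bytes at construction.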