diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index d358085a54..d41d7992d3 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -730,6 +730,11 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv) if (found != instr_labels.end()) { + if (m_preds.count(pos)) + { + c->align(kAlignCode, 16); + } + c->bind(found->second); } @@ -1118,11 +1123,22 @@ static void check_state_ret(SPUThread& _spu, void*, u8*) static void check_state(SPUThread* _spu, spu_function_t _ret) { - if (_spu->check_state()) + if (test(_spu->state) && _spu->check_state()) { _ret = &check_state_ret; } + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + // Get stack pointer, try to use native return address (check SPU return address) + const auto x = _spu->stack_mirror[(_spu->gpr[1]._u32[3] & 0x3fff0) >> 4]; + + if (x._u32[2] == _spu->pc) + { + _ret = reinterpret_cast<spu_function_t>(x._u64[0]); + } + } + _ret(*_spu, _spu->_ptr<u8>(0), nullptr); } @@ -1172,11 +1188,11 @@ void spu_recompiler::branch_fixed(u32 target) c->jmp(x86::rax); } -void spu_recompiler::branch_indirect(spu_opcode_t op, bool local) +void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { using namespace asmjit; - if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !local) + if (g_cfg.core.spu_block_size != spu_block_size_type::giga && !jt) { // Simply external call (return or indirect call) c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); @@ -1238,12 +1254,59 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool local) c->mov(SPU_OFF_32(pc), *addr); c->cmp(SPU_OFF_32(state), 0); c->jnz(label_check); + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret) + { + // Get stack pointer, try to use native return address (check SPU return address) + c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->and_(qw1->r32(), 0x3fff0); + 
c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror))); + c->cmp(x86::dword_ptr(*qw1, 8), *addr); + c->cmove(x86::r10, x86::qword_ptr(*qw1)); + } + c->jmp(x86::r10); c->bind(label_check); c->mov(*ls, x86::r10); c->jmp(imm_ptr(&check_state)); } +void spu_recompiler::branch_set_link(u32 target) +{ + using namespace asmjit; + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + // Find instruction at target + const auto local = instr_labels.find(target); + + if (local != instr_labels.end() && local->second.isValid()) + { + Label ret = c->newLabel(); + + // Get stack pointer, write native and SPU return addresses into the stack mirror + c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->and_(qw1->r32(), 0x3fff0); + c->lea(*qw1, x86::qword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror))); + c->lea(x86::r10, x86::qword_ptr(ret)); + c->mov(x86::qword_ptr(*qw1, 0), x86::r10); + c->mov(x86::qword_ptr(*qw1, 8), target); + + after.emplace_back([=, target = local->second] + { + // Clear return info after use + c->align(kAlignCode, 16); + c->bind(ret); + c->mov(qw1->r32(), SPU_OFF_32(gpr, 1, &v128::_u32, 3)); + c->and_(qw1->r32(), 0x3fff0); + c->pcmpeqd(x86::xmm0, x86::xmm0); + c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&SPUThread::stack_mirror)), x86::xmm0); + c->jmp(target); + }); + } + } +} + void spu_recompiler::fall(spu_opcode_t op) { auto gate = [](SPUThread* _spu, u32 opcode, spu_inter_func_t _func, spu_function_t _ret) @@ -2768,9 +2831,17 @@ void spu_recompiler::STQX(spu_opcode_t op) void spu_recompiler::BI(spu_opcode_t op) { + const auto found = m_targets.find(m_pos); + const auto is_jt = found == m_targets.end() || found->second.size() != 1 || found->second.front() != -1; + + if (found == m_targets.end() || found->second.empty()) + { + LOG_ERROR(SPU, "[0x%x] BI: no targets", m_pos); + } + c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); c->and_(*addr, 0x3fffc); - branch_indirect(op, 
m_targets.find(m_pos) != m_targets.end()); + branch_indirect(op, is_jt, !is_jt); m_pos = -1; } @@ -2781,7 +2852,8 @@ void spu_recompiler::BISL(spu_opcode_t op) const XmmLink& vr = XmmAlloc(); c->movdqa(vr, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0))); c->movdqa(SPU_OFF_128(gpr, op.rt), vr); - branch_indirect(op, m_targets.find(m_pos) != m_targets.end()); + branch_set_link(m_pos + 4); + branch_indirect(op, true, false); m_pos = -1; } @@ -4282,6 +4354,7 @@ void spu_recompiler::BRASL(spu_opcode_t op) if (target != m_pos + 4) { + branch_set_link(m_pos + 4); branch_fixed(target); m_pos = -1; } @@ -4319,6 +4392,7 @@ void spu_recompiler::BRSL(spu_opcode_t op) if (target != m_pos + 4) { + branch_set_link(m_pos + 4); branch_fixed(target); m_pos = -1; } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index a2c77a5e75..6388cb157c 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -103,7 +103,8 @@ private: asmjit::X86Mem XmmConst(__m128i data); void branch_fixed(u32 target); - void branch_indirect(spu_opcode_t op, bool local = false); + void branch_indirect(spu_opcode_t op, bool jt = false, bool ret = true); + void branch_set_link(u32 target); void fall(spu_opcode_t op); void save_rcx(); void load_rcx(); diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 47968c3a7f..8768a1fd65 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -90,6 +90,12 @@ void spu_cache::initialize() return; } + if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) + { + // Force Safe mode + g_cfg.core.spu_block_size.from_default(); + } + // SPU cache file (version + block size type) const std::string loc = _main->cache + u8"spu-ยง" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v3.dat"; @@ -384,7 +390,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) continue; } - if (g_cfg.core.spu_block_size != 
spu_block_size_type::giga) + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { // Stop on special instructions (TODO) m_targets[pos].push_back(-1); @@ -437,8 +443,9 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) add_block(target); } - if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (type == spu_itype::BISL && g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_targets[pos].push_back(pos + 4); add_block(pos + 4); } } @@ -548,7 +555,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) if (type == spu_itype::BI || type == spu_itype::BISL) { - if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga) + if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe) { if (m_targets[pos].empty()) { @@ -557,6 +564,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } else { + m_targets[pos].push_back(pos + 4); add_block(pos + 4); } } @@ -587,8 +595,9 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) m_targets[pos].push_back(target); - if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_targets[pos].push_back(pos + 4); add_block(pos + 4); } @@ -803,11 +812,11 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } - while (g_cfg.core.spu_block_size == spu_block_size_type::safe) + while (g_cfg.core.spu_block_size != spu_block_size_type::giga) { const u32 initial_size = result.size(); - // Check unreachable blocks in safe mode (TODO) + // Check unreachable blocks in safe and mega modes (TODO) u32 limit = lsa + result.size() * 4 - 4; for (auto& pair : m_preds) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 069af53ca3..514ca91f4d 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -637,8 +637,17 @@ 
SPUThread::SPUThread(const std::string& name, u32 index, lv2_spu_group* group) jit = spu_recompiler_base::make_llvm_recompiler(); } - // Initialize lookup table - jit_dispatcher.fill(&spu_recompiler_base::dispatch); + if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise) + { + // Initialize lookup table + jit_dispatcher.fill(&spu_recompiler_base::dispatch); + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + // Initialize stack mirror + std::memset(stack_mirror.data(), 0xff, sizeof(stack_mirror)); + } + } } void SPUThread::push_snr(u32 number, u32 value) diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 061adee884..baee513c58 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -588,6 +588,8 @@ public: std::array<spu_function_t, 0x10000> jit_dispatcher; // Dispatch table for indirect calls + std::array<v128, 0x4000> stack_mirror; // Return address information + void push_snr(u32 number, u32 value); void do_dma_transfer(const spu_mfc_cmd& args); bool do_dma_check(const spu_mfc_cmd& args);