SPU: Make recompilers lock-free.

This commit is contained in:
Nekotekina 2019-10-26 00:52:56 +03:00
parent 9ac6ef6494
commit 8c28c4e8ec
4 changed files with 248 additions and 260 deletions

View file

@ -45,24 +45,23 @@ void spu_recompiler::init()
}
}
spu_function_t spu_recompiler::compile(const std::vector<u32>& func, void* fn_location)
spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
{
if (!fn_location)
{
fn_location = m_spurt->find(func);
}
const auto add_loc = m_spurt->add_empty(std::move(_func));
if (fn_location == spu_runtime::g_dispatcher)
{
return &dispatch;
}
if (!fn_location)
if (!add_loc)
{
return nullptr;
}
if (auto cache = g_fxo->get<spu_cache>(); cache && g_cfg.core.spu_cache)
if (add_loc->compiled)
{
return add_loc->compiled;
}
const std::vector<u32>& func = add_loc->data;
if (auto cache = g_fxo->get<spu_cache>(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1))
{
cache->add(func);
}
@ -94,10 +93,10 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func, void* fn_lo
X86Assembler compiler(&code);
this->c = &compiler;
if (g_cfg.core.spu_debug)
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
// Dump analyser data
this->dump(log);
this->dump(func, log);
fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
// Set logger
@ -892,12 +891,21 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func, void* fn_lo
LOG_FATAL(SPU, "Failed to build a function");
}
if (!m_spurt->add(fn_location, fn))
// Install compiled function pointer
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
{
return nullptr;
}
if (g_cfg.core.spu_debug)
if (added)
{
add_loc->compiled.notify_all();
}
if (g_cfg.core.spu_debug && added)
{
// Add ASMJIT logs
fmt::append(log, "Address: %p\n\n", fn);

View file

@ -13,7 +13,7 @@ public:
virtual void init() override;
virtual spu_function_t compile(const std::vector<u32>&, void*) override;
virtual spu_function_t compile(std::vector<u32>&&) override;
private:
// ASMJIT runtime

View file

@ -400,15 +400,19 @@ void spu_cache::initialize()
{
compiler->init();
if (compiler->compile({}, nullptr) && spu_runtime::g_interpreter)
if (compiler->compile({}) && spu_runtime::g_interpreter)
{
LOG_SUCCESS(SPU, "SPU Runtime: built interpreter.");
LOG_SUCCESS(SPU, "SPU Runtime: Built the interpreter.");
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
}
else
{
LOG_FATAL(SPU, "SPU Runtime: Failed to build the interpreter.");
}
}
}
@ -472,34 +476,21 @@ void spu_cache::initialize()
}
// Call analyser
const std::vector<u32>& func2 = compiler->analyse(ls.data(), func[0]);
std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
if (func2.size() != size0)
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
}
if (!compiler->compile(func, nullptr))
if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
}
// Clear fake LS
for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4)
{
if (se_storage<u32>::swap(func2[i]) != ls[pos / 4])
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos);
}
ls[pos / 4] = 0;
}
if (func2.size() != size0)
{
std::memset(ls.data(), 0, 0x40000);
}
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
g_progr_pdone++;
}
@ -519,7 +510,7 @@ void spu_cache::initialize()
if (fail_flag)
{
LOG_ERROR(SPU, "SPU Runtime: Cache building failed (too much data). SPU Cache will be disabled.");
LOG_FATAL(SPU, "SPU Runtime: Cache building failed (out of memory).");
return;
}
@ -581,9 +572,6 @@ bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const st
spu_runtime::spu_runtime()
{
// Initialize "empty" block
m_map[std::vector<u32>()] = tr_dispatch;
// Clear LLVM output
m_cache_path = Emu.PPUCache();
@ -602,60 +590,77 @@ spu_runtime::spu_runtime()
}
}
bool spu_runtime::add(void* _where, spu_function_t compiled)
spu_item* spu_runtime::add_empty(std::vector<u32>&& data)
{
writer_lock lock(*this);
if (!_where)
if (data.size() <= 1)
{
return false;
return nullptr;
}
// Use opaque pointer
auto& where = *static_cast<decltype(m_map)::value_type*>(_where);
// Store previous item if already added
spu_item* prev = nullptr;
// Function info
const std::vector<u32>& func = get_func(_where);
//
const u32 _off = 1 + (func[0] / 4) * (false);
// Set pointer to the compiled function
where.second = compiled;
// Register function in PIC map
m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;
if (func.size() > 1)
// Try to add an item if it doesn't exist yet; otherwise remember the existing one
const auto ret = m_stuff[data[1] >> 12].push_if([&](spu_item& _new, spu_item& _old)
{
// Rebuild trampolines if necessary
if (const auto new_tr = rebuild_ubertrampoline(func[1]))
{
g_dispatcher->at(func[1] >> 12) = new_tr;
}
else
std::basic_string_view<u32> lhs{_new.data.data() + 1, _new.data.size() - 1};
std::basic_string_view<u32> rhs{_old.data.data() + 1, _old.data.size() - 1};
if (lhs == rhs)
{
prev = &_old;
return false;
}
return true;
}, std::move(data));
if (ret)
{
return ret;
}
// Notify in lock destructor
lock.notify = true;
return true;
return prev;
}
spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
{
// Prepare sorted list
m_flat_list.clear();
{
// Select required subrange (fixed 20 bits for single pos in g_dispatcher table)
const u32 id_lower = id_inst & ~0xfff;
const u32 id_upper = id_inst | 0xfff;
static thread_local std::vector<std::pair<std::basic_string_view<u32>, spu_function_t>> m_flat_list;
m_flat_list.assign(m_pic_map.lower_bound({&id_lower, 1}), m_pic_map.upper_bound({&id_upper, 1}));
// Remember top position
auto stuff_it = m_stuff.at(id_inst >> 12).begin();
auto stuff_end = m_stuff.at(id_inst >> 12).end();
{
if (stuff_it->trampoline)
{
return stuff_it->trampoline;
}
m_flat_list.clear();
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (const auto ptr = it->compiled.load())
{
std::basic_string_view<u32> range{it->data.data() + 1, it->data.size() - 1};
m_flat_list.emplace_back(range, ptr);
}
else
{
// Pull oneself deeper (TODO)
++stuff_it;
}
}
}
std::sort(m_flat_list.begin(), m_flat_list.end(), [&](const auto& a, const auto& b)
{
std::basic_string_view<u32> lhs = a.first;
std::basic_string_view<u32> rhs = b.first;
return lhs < rhs;
});
struct work
{
u32 size;
@ -674,6 +679,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
const auto _end = m_flat_list.end();
const u32 size0 = ::size32(m_flat_list);
auto result = beg->second;
if (size0 != 1)
{
// Allocate some writable executable memory
@ -944,77 +951,63 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
}
workload.clear();
return reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
result = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
}
// No trampoline required
return beg->second;
}
void* spu_runtime::find(const std::vector<u32>& func)
{
writer_lock lock(*this);
//
const u32 _off = 1 + (func[0] / 4) * (false);
// Try to find PIC first
const auto found = m_pic_map.find({func.data() + _off, func.size() - _off});
if (found != m_pic_map.end())
if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result))
{
// Wait if already in progress
while (!found->second)
return _old;
}
// Install ubertrampoline
auto& insert_to = spu_runtime::g_dispatcher->at(id_inst >> 12);
auto _old = insert_to.load();
do
{
// Make sure we are replacing an older ubertrampoline but not newer one
if (_old != tr_dispatch)
{
m_cond.wait(m_mutex);
bool ok = false;
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (it->trampoline == _old)
{
ok = true;
break;
}
}
if (!ok)
{
return result;
}
}
// Already compiled
return g_dispatcher;
}
while (!insert_to.compare_exchange(_old, result));
// Try to find existing function, register new one if necessary
const auto result = m_map.try_emplace(func, nullptr);
// Add PIC entry as well
m_pic_map.try_emplace({result.first->first.data() + _off, result.first->first.size() - _off}, nullptr);
// Pointer to the value in the map (pair)
const auto fn_location = &*result.first;
if (fn_location->second)
{
// Already compiled
return g_dispatcher;
}
else if (!result.second)
{
// Wait if already in progress
while (!fn_location->second)
{
m_cond.wait(m_mutex);
}
return g_dispatcher;
}
// Return location to compile and use in add()
return fn_location;
return result;
}
spu_function_t spu_runtime::find(const u32* ls, u32 addr) const
{
reader_lock lock(this->m_mutex);
const auto upper = m_pic_map.upper_bound({ls + addr / 4, (0x40000 - addr) / 4});
if (upper != m_pic_map.begin())
for (auto& item : m_stuff.at(ls[addr / 4] >> 12))
{
const auto found = std::prev(upper);
if (found->first.compare(0, found->first.size(), ls + addr / 4, found->first.size()) == 0)
if (const auto ptr = item.compiled.load())
{
return found->second;
std::basic_string_view<u32> range{item.data.data() + 1, item.data.size() - 1};
if (addr / 4 + range.size() > 0x10000)
{
continue;
}
if (range.compare(0, range.size(), ls + addr / 4, range.size()) == 0)
{
return ptr;
}
}
}
@ -1055,18 +1048,12 @@ spu_function_t spu_runtime::make_branch_patchpoint() const
spu_recompiler_base::spu_recompiler_base()
{
result.reserve(8192);
}
spu_recompiler_base::~spu_recompiler_base()
{
}
void spu_recompiler_base::make_function(const std::vector<u32>& data)
{
compile(data, nullptr);
}
void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
{
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
@ -1082,7 +1069,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0x90;
bytes[5] = 0x66; // lnop (2 bytes)
bytes[6] = 0x90;
bytes[7] = 0x90;
@ -1096,8 +1083,17 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
return;
}
spu.jit->init();
// Compile
spu.jit->make_function(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
if (!func)
{
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", spu.pc);
Emu.Pause();
return;
}
// Diagnostic
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -1109,6 +1105,8 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
LOG_TRACE(SPU, "Called from 0x%x", _info._u32[2] - 4);
}
}
spu_runtime::g_tail_escape(&spu, func, nullptr);
}
void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
@ -1156,6 +1154,8 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
}
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
spu_runtime::g_tail_escape(&spu, func, rip);
}
void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* rip) try
@ -1189,10 +1189,11 @@ catch (const std::exception& e)
LOG_NOTICE(GENERAL, "\n%s", spu.dump());
}
const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
{
// Result: addr + raw instruction data
result.clear();
std::vector<u32> result;
result.reserve(10000);
result.push_back(entry_point);
// Initialize block entries
@ -3120,7 +3121,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
return result;
}
void spu_recompiler_base::dump(std::string& out)
void spu_recompiler_base::dump(const std::vector<u32>& result, std::string& out)
{
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
dis_asm.offset = reinterpret_cast<const u8*>(result.data() + 1);
@ -4153,31 +4154,25 @@ public:
}
}
virtual spu_function_t compile(const std::vector<u32>& func, void* fn_location) override
virtual spu_function_t compile(std::vector<u32>&& _func) override
{
if (func.empty() && m_interp_magn)
if (_func.empty() && m_interp_magn)
{
return compile_interpreter();
}
if (!fn_location)
{
fn_location = m_spurt->find(func);
}
const auto add_loc = m_spurt->add_empty(std::move(_func));
if (fn_location == spu_runtime::g_dispatcher)
{
return &dispatch;
}
if (!fn_location)
if (!add_loc)
{
return nullptr;
}
const std::vector<u32>& func = add_loc->data;
std::string log;
if (auto cache = g_fxo->get<spu_cache>(); cache && g_cfg.core.spu_cache)
if (auto cache = g_fxo->get<spu_cache>(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1))
{
cache->add(func);
}
@ -4206,9 +4201,9 @@ public:
const u32 start = m_pos;
const u32 end = start + m_size;
if (g_cfg.core.spu_debug)
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
this->dump(log);
this->dump(func, log);
fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
}
@ -4745,11 +4740,17 @@ public:
// Register function pointer
const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));
if (!m_spurt->add(fn_location, fn))
// Install unconditionally, possibly replacing existing one from spu_fast
add_loc->compiled = fn;
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
{
return nullptr;
}
add_loc->compiled.notify_all();
if (g_cfg.core.spu_debug)
{
out.flush();
@ -8236,7 +8237,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
struct spu_llvm
{
// Workload
lf_queue<std::pair<void*, u8*>> registered;
lf_queue<spu_item*> registered;
void operator()()
{
@ -8259,7 +8260,10 @@ struct spu_llvm
continue;
}
const std::vector<u32>& func = spu_runtime::get_func(parg->first);
const std::vector<u32>& func = (*parg)->data;
// Old function pointer (pre-recompiled)
const spu_function_t _old = (*parg)->compiled;
// Get data start
const u32 start = func[0];
@ -8272,17 +8276,17 @@ struct spu_llvm
}
// Call analyser
const std::vector<u32>& func2 = compiler->analyse(ls.data(), func[0]);
std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
if (func2.size() != size0)
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
}
if (const auto target = compiler->compile(func, parg->first))
if (const auto target = compiler->compile(std::move(func2)))
{
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - reinterpret_cast<u64>(parg->second) - 5;
const s64 rel = reinterpret_cast<u64>(target) - reinterpret_cast<u64>(_old) - 5;
union
{
@ -8296,28 +8300,17 @@ struct spu_llvm
bytes[6] = 0x90;
bytes[7] = 0x90;
atomic_storage<u64>::release(*reinterpret_cast<u64*>(parg->second), result);
atomic_storage<u64>::release(*reinterpret_cast<u64*>(_old), result);
}
else
{
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func2[0]);
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func[0]);
Emu.Pause();
return;
}
// Clear fake LS
for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4)
{
if (se_storage<u32>::swap(func2[i]) != ls[pos / 4])
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos);
}
ls[pos / 4] = 0;
}
if (func2.size() != size0)
{
std::memset(ls.data(), 0, 0x40000);
}
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
}
}
@ -8336,27 +8329,26 @@ struct spu_fast : public spu_recompiler_base
}
}
virtual spu_function_t compile(const std::vector<u32>& func, void* fn_location) override
virtual spu_function_t compile(std::vector<u32>&& _func) override
{
if (!fn_location)
{
fn_location = m_spurt->find(func);
}
const auto add_loc = m_spurt->add_empty(std::move(_func));
if (fn_location == spu_runtime::g_dispatcher)
{
return &dispatch;
}
if (!fn_location)
if (!add_loc)
{
return nullptr;
}
if (g_cfg.core.spu_debug)
if (add_loc->compiled)
{
return add_loc->compiled;
}
const std::vector<u32>& func = add_loc->data;
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
std::string log;
this->dump(log);
this->dump(func, log);
fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
}
@ -8639,15 +8631,29 @@ struct spu_fast : public spu_recompiler_base
*raw++ = 0x28;
*raw++ = 0xc3;
if (!m_spurt->add(fn_location, reinterpret_cast<spu_function_t>(result)))
const auto fn = reinterpret_cast<spu_function_t>(result);
// Install the function pointer only if none was set yet (lock-free CAS)
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
if (added)
{
// Send work to LLVM compiler thread
g_fxo->get<spu_llvm_thread>()->registered.push(add_loc);
}
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
{
return nullptr;
}
// Send work to LLVM compiler thread; after add() to avoid race
g_fxo->get<spu_llvm_thread>()->registered.push(fn_location, result);
if (added)
{
add_loc->compiled.notify_all();
}
return reinterpret_cast<spu_function_t>(result);
return fn;
}
};

View file

@ -1,9 +1,8 @@
#pragma once
#include "Utilities/File.h"
#include "Utilities/mutex.h"
#include "Utilities/cond.h"
#include "Utilities/JIT.h"
#include "Utilities/lockless.h"
#include "SPUThread.h"
#include <vector>
#include <bitset>
@ -37,33 +36,47 @@ public:
static void initialize();
};
// One SPU program plus its lock-free compilation state.
// Shared between recompiler threads; all mutable members are atomics,
// so no external lock is needed (NOTE(review): relies on project atomic_t).
class spu_item
{
public:
// SPU program (addr + raw instruction data); immutable after construction
const std::vector<u32> data;
// Compiled function pointer
// Null until a recompiler installs the result (CAS from nullptr elsewhere).
atomic_t<spu_function_t> compiled = nullptr;
// Ubertrampoline generated for this item when it was latest
atomic_t<spu_function_t> trampoline = nullptr;
// Set once (via exchange) when the program has been written to the SPU cache,
// so concurrent compilers don't add duplicate cache entries
atomic_t<u8> cached = false;
// Set once (via exchange) when debug output for this program has been dumped,
// so the spu.log dump happens only once per program
atomic_t<u8> logged = false;
// Takes ownership of the program data
spu_item(std::vector<u32>&& data)
: data(std::move(data))
{
}
// Non-copyable: items are stored in a lock-free container and referenced by pointer
spu_item(const spu_item&) = delete;
spu_item& operator=(const spu_item&) = delete;
};
// Helper class
class spu_runtime
{
mutable shared_mutex m_mutex;
mutable cond_variable m_cond;
struct func_compare
{
// Comparison function for SPU programs
bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
};
// All functions
std::map<std::vector<u32>, spu_function_t, func_compare> m_map;
// All functions as PIC
std::map<std::basic_string_view<u32>, spu_function_t> m_pic_map;
// All functions (2^20 bunches)
std::array<lf_bunch<spu_item>, (1 << 20)> m_stuff;
// Debug module output location
std::string m_cache_path;
// Scratch vector
std::vector<std::pair<std::basic_string_view<u32>, spu_function_t>> m_flat_list;
public:
// Trampoline to spu_recompiler_base::dispatch
static const spu_function_t tr_dispatch;
@ -88,23 +101,15 @@ public:
return m_cache_path;
}
// Add compiled function and generate trampoline if necessary
bool add(void* where, spu_function_t compiled);
private:
// Rebuild ubertrampoline for given identifier (first instruction)
spu_function_t rebuild_ubertrampoline(u32 id_inst);
private:
friend class spu_cache;
public:
// Return opaque pointer for add()
void* find(const std::vector<u32>&);
// Get func from opaque ptr
static inline const std::vector<u32>& get_func(void* _where)
{
return static_cast<decltype(m_map)::value_type*>(_where)->first;
}
// Return new pointer for add()
spu_item* add_empty(std::vector<u32>&&);
// Find existing function
spu_function_t find(const u32* ls, u32 addr) const;
@ -129,31 +134,6 @@ public:
// Interpreter entry point
static spu_function_t g_interpreter;
// Exclusive lock
struct writer_lock
{
spu_runtime& _this;
bool notify = false;
writer_lock(const writer_lock&) = delete;
writer_lock(spu_runtime& _this)
: _this(_this)
{
_this.m_mutex.lock();
}
~writer_lock()
{
_this.m_mutex.unlock();
if (notify)
{
_this.m_cond.notify_all();
}
}
};
};
// SPU Recompiler instance base class
@ -303,9 +283,6 @@ private:
// For private use
std::vector<u32> workload;
// Result of analyse(), to avoid copying and allocation
std::vector<u32> result;
public:
spu_recompiler_base();
@ -314,11 +291,8 @@ public:
// Initialize
virtual void init() = 0;
// Compile function (may fail)
virtual spu_function_t compile(const std::vector<u32>&, void*) = 0;
// Compile function, handle failure
void make_function(const std::vector<u32>&);
// Compile function
virtual spu_function_t compile(std::vector<u32>&&) = 0;
// Default dispatch function fallback (second arg is unused)
static void dispatch(spu_thread&, void*, u8* rip);
@ -330,10 +304,10 @@ public:
static void old_interpreter(spu_thread&, void* ls, u8*);
// Get the function data at specified address
const std::vector<u32>& analyse(const be_t<u32>* ls, u32 lsa);
std::vector<u32> analyse(const be_t<u32>* ls, u32 lsa);
// Print analyser internal state
void dump(std::string& out);
void dump(const std::vector<u32>& result, std::string& out);
// Get SPU Runtime
spu_runtime& get_runtime()