SPU: internal refactoring, add spu_program

Use struct (spu_program) instead of std::vector<u32>.
2025-10-02 22:29:14 +00:00 · 2019-11-23 19:30:54 +03:00 · 2019-11-23 19:30:54 +03:00 · 1b9a3e6077
commit 1b9a3e6077
parent 4caf747729
4 changed files with 215 additions and 213 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -45,9 +45,9 @@ void spu_recompiler::init()
 	}
 }
-spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
+spu_function_t spu_recompiler::compile(spu_program&& _func)
 {
-	const u32 start0 = _func[0];
+	const u32 start0 = _func.entry_point;
 	const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -61,9 +61,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 		return add_loc->compiled;
 	}
-	const std::vector<u32>& func = add_loc->data;
+	const spu_program& func = add_loc->data;
-	if (func[0] != start0)
+	if (func.entry_point != start0)
 	{
 		// Wait for the duplicate
 		while (!add_loc->compiled)
@ -84,7 +84,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 		u8 output[20];
 		sha1_starts(&ctx);
-		sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+		sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
 		sha1_finish(&ctx, output);
 		be_t<u64> hash_start;
@ -168,18 +168,18 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 	u32 words_align = 8;
 	// Start compilation
-	m_pos = func[0];
+	m_pos = func.lower_bound;
-	m_base = func[0];
+	m_base = func.entry_point;
-	m_size = ::size32(func) * 4 - 4;
+	m_size = ::size32(func.data) * 4;
 	const u32 start = m_pos;
 	const u32 end = start + m_size;
 	// Create block labels
-	for (u32 i = 1; i < func.size(); i++)
+	for (u32 i = 0; i < func.data.size(); i++)
 	{
-		if (func[i] && m_block_info[i - 1 + start / 4])
+		if (func.data[i] && m_block_info[i + start / 4])
 		{
-			instr_labels[i * 4 - 4 + start] = c->newLabel();
+			instr_labels[i * 4 + start] = c->newLabel();
 		}
 	}
@ -211,7 +211,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 		for (u32 addr = starta, m = 1; addr < enda && m; addr += 4, m <<= 1)
 		{
 			// Filter out if out of range, or is a hole
-			if (addr >= start && addr < end && func[(addr - start) / 4 + 1])
+			if (addr >= start && addr < end && func.data[(addr - start) / 4])
 			{
 				result |= m;
 			}
@ -226,7 +226,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 	// Skip holes at the beginning (giga only)
 	for (u32 j = start; j < end; j += 4)
 	{
-		if (!func[(j - start) / 4 + 1])
+		if (!func.data[(j - start) / 4])
 		{
 			starta += 4;
 		}
@ -261,7 +261,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 	}
 	else if (m_size == 8)
 	{
-		c->mov(x86::rax, static_cast<u64>(func[2]) << 32 | func[1]);
+		c->mov(x86::rax, static_cast<u64>(func.data[1]) << 32 | func.data[0]);
 		c->cmp(x86::rax, x86::qword_ptr(*ls, *pc0));
 		c->jnz(label_diff);
@ -272,7 +272,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 	}
 	else if (m_size == 4)
 	{
-		c->cmp(x86::dword_ptr(*ls, *pc0), func[1]);
+		c->cmp(x86::dword_ptr(*ls, *pc0), func.data[0]);
 		c->jnz(label_diff);
 		if (utils::has_avx())
@ -351,7 +351,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			for (u32 i = j; i < j + 64; i += 4)
 			{
-				words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
+				words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
 			}
 			code_off += 64;
@ -391,7 +391,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			for (u32 i = starta; i < enda; i += 4)
 			{
-				words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
+				words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
 			}
 		}
 		else if (sizea == 2 && (end - start) <= 32)
@ -408,7 +408,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			for (u32 i = starta; i < starta + 32; i += 4)
 			{
-				words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
+				words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? func.data[(i + 32 - start) / 4] : 0);
 			}
 		}
 		else
@ -471,7 +471,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 				for (u32 i = j; i < j + 32; i += 4)
 				{
-					words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
+					words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
 				}
 				code_off += 32;
@ -513,7 +513,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			for (u32 i = starta; i < enda; i += 4)
 			{
-				words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
+				words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
 			}
 		}
 		else if (sizea == 2 && (end - start) <= 32)
@ -530,7 +530,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			for (u32 i = starta; i < starta + 32; i += 4)
 			{
-				words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
+				words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? func.data[(i + 32 - start) / 4] : 0);
 			}
 		}
 		else
@ -605,7 +605,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 				for (u32 i = j; i < j + 32; i += 4)
 				{
-					words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
+					words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
 				}
 				code_off += 32;
@ -675,10 +675,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 			}
 			// Determine which value will be duplicated at hole positions
-			const u32 w3 = func.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4 + 1);
+			const u32 w3 = func.data.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4);
-			words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3);
+			words.push_back(cmask & 1 ? func.data[(j - start + 0) / 4] : w3);
-			words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3);
+			words.push_back(cmask & 2 ? func.data[(j - start + 4) / 4] : w3);
-			words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3);
+			words.push_back(cmask & 4 ? func.data[(j - start + 8) / 4] : w3);
 			words.push_back(w3);
 			// PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word
@ -771,10 +771,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 		m_pos = -1;
 	}
-	for (u32 i = 1; i < func.size(); i++)
+	for (u32 i = 0; i < func.data.size(); i++)
 	{
-		const u32 pos = start + (i - 1) * 4;
+		const u32 pos = start + i * 4;
-		const u32 op  = se_storage<u32>::swap(func[i]);
+		const u32 op  = std::bit_cast<be_t<u32>>(func.data[i]);
 		if (!op)
 		{
@ -908,7 +908,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
 	const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
 	// Rebuild trampoline if necessary
-	if (!m_spurt->rebuild_ubertrampoline(func[1]))
+	if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
 	{
 		return nullptr;
 	}
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -13,7 +13,7 @@ public:
 	virtual void init() override;
-	virtual spu_function_t compile(std::vector<u32>&&) override;
+	virtual spu_function_t compile(spu_program&&) override;
 private:
 	// ASMJIT runtime
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -291,9 +291,9 @@ spu_cache::~spu_cache()
 {
 }
-std::deque<std::vector<u32>> spu_cache::get()
+std::deque<spu_program> spu_cache::get()
 {
-	std::deque<std::vector<u32>> result;
+	std::deque<spu_program> result;
 	if (!m_file)
 	{
@ -314,41 +314,44 @@ std::deque<std::vector<u32>> spu_cache::get()
 			break;
 		}
-		func.resize(size + 1);
+		func.resize(size);
 		func[0] = addr;
-		if (m_file.read(func.data() + 1, func.size() * 4 - 4) != func.size() * 4 - 4)
+		if (m_file.read(func.data(), func.size() * 4) != func.size() * 4)
 		{
 			break;
 		}
-		if (!size || !func[1])
+		if (!size || !func[0])
 		{
 			// Skip old format Giga entries
 			continue;
 		}
-		result.emplace_front(std::move(func));
+		spu_program res;
 		res.entry_point = addr;
 		res.lower_bound = addr;
 		res.data = std::move(func);
 		result.emplace_front(std::move(res));
 	}
 	return result;
 }
-void spu_cache::add(const std::vector<u32>& func)
+void spu_cache::add(const spu_program& func)
 {
 	if (!m_file)
 	{
 		return;
 	}
-	be_t<u32> size = ::size32(func) - 1;
+	be_t<u32> size = ::size32(func.data);
-	be_t<u32> addr = func[0];
+	be_t<u32> addr = func.entry_point;
 	const fs::iovec_clone gather[3]
 	{
 		{&size, sizeof(size)},
 		{&addr, sizeof(addr)},
-		{func.data() + 1, func.size() * 4 - 4}
+		{func.data.data(), func.data.size() * 4}
 	};
 	// Append data
@ -458,7 +461,7 @@ void spu_cache::initialize()
 		// Build functions
 		for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
 		{
-			const std::vector<u32>& func = std::as_const(func_list)[func_i];
+			const spu_program& func = std::as_const(func_list)[func_i];
 			if (Emu.IsStopped() || fail_flag)
 			{
@ -467,21 +470,21 @@ void spu_cache::initialize()
 			}
 			// Get data start
-			const u32 start = func[0];
+			const u32 start = func.lower_bound;
-			const u32 size0 = ::size32(func);
+			const u32 size0 = ::size32(func.data);
 			// Initialize LS with function data only
-			for (u32 i = 1, pos = start; i < size0; i++, pos += 4)
+			for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
 			{
-				ls[pos / 4] = se_storage<u32>::swap(func[i]);
+				ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
 			}
 			// Call analyser
-			std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
+			spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
 			if (func2 != func)
 			{
-				LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
+				LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
 			}
 			else if (!compiler->compile(std::move(func2)))
 			{
@ -523,51 +526,39 @@ void spu_cache::initialize()
 	g_fxo->init<spu_cache>(std::move(cache));
 }
-bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const
+bool spu_program::operator==(const spu_program& rhs) const noexcept
 {
-	if (lhs.empty())
+	// TODO
-		return !rhs.empty();
+	return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data;
-	else if (rhs.empty())
+}
 		return false;
-	const u32 lhs_addr = lhs[0];
+bool spu_program::operator<(const spu_program& rhs) const noexcept
-	const u32 rhs_addr = rhs[0];
+{
-
+	const u32 lhs_offs = (entry_point - lower_bound) / 4;
-	if (lhs_addr < rhs_addr)
+	const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4;
 		return true;
 	else if (lhs_addr > rhs_addr)
 		return false;
 	// Select range for comparison
-	std::basic_string_view<u32> lhs_data(lhs.data() + 1, lhs.size() - 1);
+	std::basic_string_view<u32> lhs_data(data.data() + lhs_offs, data.size() - lhs_offs);
-	std::basic_string_view<u32> rhs_data(rhs.data() + 1, rhs.size() - 1);
+	std::basic_string_view<u32> rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs);
 	const auto cmp0 = lhs_data.compare(rhs_data);
-	if (lhs_data.empty())
+	if (cmp0 < 0)
-		return !rhs_data.empty();
+		return true;
-	else if (rhs_data.empty())
+	else if (cmp0 > 0)
 		return false;
-	if (false)
+	// Compare from address 0 to the point before the entry point (TODO: undesirable)
-	{
+	lhs_data = {data.data(), lhs_offs};
-		// In Giga mode, compare instructions starting from the entry point first
+	rhs_data = {rhs.data.data(), rhs_offs};
-		lhs_data.remove_prefix(lhs_addr / 4);
+	const auto cmp1 = lhs_data.compare(rhs_data);
 		rhs_data.remove_prefix(rhs_addr / 4);
 		const auto cmp0 = lhs_data.compare(rhs_data);
-		if (cmp0 < 0)
+	if (cmp1 < 0)
-			return true;
+		return true;
-		else if (cmp0 > 0)
+	else if (cmp1 > 0)
-			return false;
+		return false;
-		// Compare from address 0 to the point before the entry point (undesirable)
+	// TODO
-		lhs_data = {lhs.data() + 1, lhs_addr / 4};
+	return lhs_offs < rhs_offs;
 		rhs_data = {rhs.data() + 1, rhs_addr / 4};
 		return lhs_data < rhs_data;
 	}
 	else
 	{
 		return lhs_data < rhs_data;
 	}
 }
 spu_runtime::spu_runtime()
@ -590,9 +581,9 @@ spu_runtime::spu_runtime()
 	}
 }
-spu_item* spu_runtime::add_empty(std::vector<u32>&& data)
+spu_item* spu_runtime::add_empty(spu_program&& data)
 {
-	if (data.size() <= 1)
+	if (data.data.empty())
 	{
 		return nullptr;
 	}
@ -601,12 +592,9 @@ spu_item* spu_runtime::add_empty(std::vector<u32>&& data)
 	spu_item* prev = nullptr;
 	//Try to add item that doesn't exist yet
-	const auto ret = m_stuff[data[1] >> 12].push_if([&](spu_item& _new, spu_item& _old)
+	const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old)
 	{
-		std::basic_string_view<u32> lhs{_new.data.data() + 1, _new.data.size() - 1};
+		if (_new.data == _old.data)
 		std::basic_string_view<u32> rhs{_old.data.data() + 1, _old.data.size() - 1};
 		if (lhs == rhs)
 		{
 			prev = &_old;
 			return false;
@ -643,7 +631,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 		{
 			if (const auto ptr = it->compiled.load())
 			{
-				std::basic_string_view<u32> range{it->data.data() + 1, it->data.size() - 1};
+				std::basic_string_view<u32> range{it->data.data.data(), it->data.data.size()};
 				range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4);
 				m_flat_list.emplace_back(range, ptr);
 			}
 			else
@ -997,7 +986,8 @@ spu_function_t spu_runtime::find(const u32* ls, u32 addr) const
 	{
 		if (const auto ptr = item.compiled.load())
 		{
-			std::basic_string_view<u32> range{item.data.data() + 1, item.data.size() - 1};
+			std::basic_string_view<u32> range{item.data.data.data(), item.data.data.size()};
 			range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4);
 			if (addr / 4 + range.size() > 0x10000)
 			{
@ -1194,12 +1184,13 @@ catch (const std::exception& e)
 	LOG_NOTICE(GENERAL, "\n%s", spu.dump());
 }
-std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
+spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
 {
 	// Result: addr + raw instruction data
-	std::vector<u32> result;
+	spu_program result;
-	result.reserve(10000);
+	result.data.reserve(10000);
-	result.push_back(entry_point);
+	result.entry_point = entry_point;
 	result.lower_bound = entry_point;
 	// Initialize block entries
 	m_block_info.reset();
@ -1400,7 +1391,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			{
 				const u32 target = spu_branch_target(av);
-				LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", result[0], pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
+				LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
 				m_targets[pos].push_back(target);
@ -1408,7 +1399,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 				{
 					if (sync)
 					{
-						LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", result[0], pos, sl ? "" : "tail ", target);
+						LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target);
 						if (target > entry_point)
 						{
@ -1505,17 +1496,17 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 					if (jt_abs.size() >= jt_rel.size())
 					{
-						const u32 new_size = (start - lsa) / 4 + 1 + jt_abs.size();
+						const u32 new_size = (start - lsa) / 4 + jt_abs.size();
-						if (result.size() < new_size)
+						if (result.data.size() < new_size)
 						{
-							result.resize(new_size);
+							result.data.resize(new_size);
 						}
 						for (u32 i = 0; i < jt_abs.size(); i++)
 						{
 							add_block(jt_abs[i]);
-							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
+							result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_abs[i]);
 							m_targets[start + i * 4];
 						}
@ -1524,17 +1515,17 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 					if (jt_rel.size() >= jt_abs.size())
 					{
-						const u32 new_size = (start - lsa) / 4 + 1 + jt_rel.size();
+						const u32 new_size = (start - lsa) / 4 + jt_rel.size();
-						if (result.size() < new_size)
+						if (result.data.size() < new_size)
 						{
-							result.resize(new_size);
+							result.data.resize(new_size);
 						}
 						for (u32 i = 0; i < jt_rel.size(); i++)
 						{
 							add_block(jt_rel[i]);
-							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
+							result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_rel[i] - start);
 							m_targets[start + i * 4];
 						}
@ -1569,7 +1560,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			}
 			else if (type == spu_itype::BI && sync)
 			{
-				LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", result[0], pos);
+				LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos);
 			}
 			if (type == spu_itype::BI || sl)
@ -1630,7 +1621,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			{
 				if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", result[0], pos, target);
+					LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target);
 				}
 				if (target > entry_point)
@ -1656,7 +1647,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			{
 				if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", result[0], pos, target);
+					LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target);
 				}
 				if (target > entry_point)
@ -1914,31 +1905,33 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 		}
 		// Insert raw instruction value
-		if (result.size() - 1 <= (pos - lsa) / 4)
+		const u32 new_size = (pos - lsa) / 4;
 		if (result.data.size() <= new_size)
 		{
-			if (result.size() - 1 < (pos - lsa) / 4)
+			if (result.data.size() < new_size)
 			{
-				result.resize((pos - lsa) / 4 + 1);
+				result.data.resize(new_size);
 			}
-			result.emplace_back(se_storage<u32>::swap(data));
+			result.data.emplace_back(std::bit_cast<u32, be_t<u32>>(data));
 		}
-		else if (u32& raw_val = result[(pos - lsa) / 4 + 1])
+		else if (u32& raw_val = result.data[new_size])
 		{
-			verify(HERE), raw_val == se_storage<u32>::swap(data);
+			verify(HERE), raw_val == std::bit_cast<u32, be_t<u32>>(data);
 		}
 		else
 		{
-			raw_val = se_storage<u32>::swap(data);
+			raw_val = std::bit_cast<u32, be_t<u32>>(data);
 		}
 	}
 	while (lsa > 0 || limit < 0x40000)
 	{
-		const u32 initial_size = result.size();
+		const u32 initial_size = result.data.size();
 		// Check unreachable blocks
-		limit = std::min<u32>(limit, lsa + initial_size * 4 - 4);
+		limit = std::min<u32>(limit, lsa + initial_size * 4);
 		for (auto& pair : m_preds)
 		{
@ -1961,7 +1954,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 				for (u32 j = workload[i];; j -= 4)
 				{
 					// Go backward from an address until the entry point is reached
-					if (j == result[0])
+					if (j == entry_point)
 					{
 						reachable = true;
 						break;
@ -1994,7 +1987,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 					// Check for possible fallthrough predecessor
 					if (!had_fallthrough)
 					{
-						if (result.at((j - lsa) / 4) == 0 || m_targets.count(j - 4))
+						if (result.data.at((j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4))
 						{
 							break;
 						}
@ -2018,16 +2011,16 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			}
 		}
-		result.resize((limit - lsa) / 4 + 1);
+		result.data.resize((limit - lsa) / 4);
 		// Check holes in safe mode (TODO)
 		u32 valid_size = 0;
-		for (u32 i = 1; i < result.size(); i++)
+		for (u32 i = 0; i < result.data.size(); i++)
 		{
-			if (result[i] == 0)
+			if (result.data[i] == 0)
 			{
-				const u32 pos  = lsa + (i - 1) * 4;
+				const u32 pos  = lsa + i * 4;
 				const u32 data = ls[pos / 4];
 				// Allow only NOP or LNOP instructions in holes
@ -2038,34 +2031,34 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 				if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
 				{
-					result.resize(valid_size + 1);
+					result.data.resize(valid_size);
 					break;
 				}
 			}
 			else
 			{
-				valid_size = i;
+				valid_size = i + 1;
 			}
 		}
 		// Even if NOP or LNOP, should be removed at the end
-		result.resize(valid_size + 1);
+		result.data.resize(valid_size);
 		// Repeat if blocks were removed
-		if (result.size() == initial_size)
+		if (result.data.size() == initial_size)
 		{
 			break;
 		}
 	}
-	limit = std::min<u32>(limit, lsa + ::size32(result) * 4 - 4);
+	limit = std::min<u32>(limit, lsa + ::size32(result.data) * 4);
 	// Cleanup block info
 	for (u32 i = 0; i < workload.size(); i++)
 	{
 		const u32 addr = workload[i];
-		if (addr < lsa || addr >= limit || !result[(addr - lsa) / 4 + 1])
+		if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4])
 		{
 			m_block_info[addr / 4] = false;
 			m_entry_info[addr / 4] = false;
@ -2104,7 +2097,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 		const u32 prev = (it->first - 4) & 0x3fffc;
 		// TODO: check the correctness
-		if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result[(prev - lsa) / 4 + 1])
+		if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4])
 		{
 			// Add target and the predecessor
 			m_targets[prev].push_back(it->first);
@ -2127,25 +2120,25 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 	}
 	// Fill holes which contain only NOP and LNOP instructions (TODO: compile)
-	for (u32 i = 1, nnop = 0, vsize = 0; i <= result.size(); i++)
+	for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++)
 	{
-		if (i >= result.size() || result[i])
+		if (i >= result.data.size() || result.data[i])
 		{
-			if (nnop && nnop == i - vsize - 1)
+			if (nnop && nnop == i - vsize)
 			{
 				// Write only complete NOP sequence
-				for (u32 j = vsize + 1; j < i; j++)
+				for (u32 j = vsize; j < i; j++)
 				{
-					result[j] = se_storage<u32>::swap(ls[lsa / 4 + j - 1]);
+					result.data[j] = std::bit_cast<u32, be_t<u32>>(ls[lsa / 4 + j]);
 				}
 			}
 			nnop  = 0;
-			vsize = i;
+			vsize = i + 1;
 		}
 		else
 		{
-			const u32 pos  = lsa + (i - 1) * 4;
+			const u32 pos  = lsa + i * 4;
 			const u32 data = ls[pos / 4];
 			if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
@ -2169,7 +2162,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 			block.size++;
 			// Decode instruction
-			const spu_opcode_t op{se_storage<u32>::swap(result[(ia - lsa) / 4 + 1])};
+			const spu_opcode_t op{std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4])};
 			const auto type = s_spu_itype.decode(op.opcode);
@ -2663,7 +2656,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 		for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4)
 		{
 			// Decode instruction again
-			op.opcode = se_storage<u32>::swap(result[(ia - lsa) / 4 + 1]);
+			op.opcode = std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 41]);
 			last_inst = s_spu_itype.decode(op.opcode);
 			// Propagate some constants
@ -3117,24 +3110,18 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
 		}
 	}
-	if (result.size() == 1)
+	if (result.data.empty())
 	{
 		// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
 		result.clear();
 	}
 	return result;
 }
-void spu_recompiler_base::dump(const std::vector<u32>& result, std::string& out)
+void spu_recompiler_base::dump(const spu_program& result, std::string& out)
 {
 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
-	dis_asm.offset = reinterpret_cast<const u8*>(result.data() + 1);
+	dis_asm.offset = reinterpret_cast<const u8*>(result.data.data()) - result.lower_bound;
 	if (true)
 	{
 		dis_asm.offset -= result[0];
 	}
 	std::string hash;
 	{
@ -3142,12 +3129,12 @@ void spu_recompiler_base::dump(const std::vector<u32>& result, std::string& out)
 		u8 output[20];
 		sha1_starts(&ctx);
-		sha1_update(&ctx, reinterpret_cast<const u8*>(result.data() + 1), result.size() * 4 - 4);
+		sha1_update(&ctx, reinterpret_cast<const u8*>(result.data.data()), result.data.size() * 4);
 		sha1_finish(&ctx, output);
 		fmt::append(hash, "%s", fmt::base57(output));
 	}
-	fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result[0], result.size() - 1, hash);
+	fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result.entry_point, result.data.size(), hash);
 	for (auto& bb : m_bbs)
 	{
@ -4162,14 +4149,14 @@ public:
 		}
 	}
-	virtual spu_function_t compile(std::vector<u32>&& _func) override
+	virtual spu_function_t compile(spu_program&& _func) override
 	{
-		if (_func.empty() && m_interp_magn)
+		if (_func.data.empty() && m_interp_magn)
 		{
 			return compile_interpreter();
 		}
-		const u32 start0 = _func[0];
+		const u32 start0 = _func.entry_point;
 		const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -4178,9 +4165,9 @@ public:
 			return nullptr;
 		}
-		const std::vector<u32>& func = add_loc->data;
+		const spu_program& func = add_loc->data;
-		if (func[0] != start0)
+		if (func.entry_point != start0)
 		{
 			// Wait for the duplicate
 			while (!add_loc->compiled)
@ -4203,22 +4190,22 @@ public:
 			u8 output[20];
 			sha1_starts(&ctx);
-			sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+			sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
 			sha1_finish(&ctx, output);
 			m_hash.clear();
-			fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output));
+			fmt::append(m_hash, "spu-0x%05x-%s", func.entry_point, fmt::base57(output));
 			be_t<u64> hash_start;
 			std::memcpy(&hash_start, output, sizeof(hash_start));
 			m_hash_start = hash_start;
 		}
-		LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, m_hash);
+		LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
-		m_pos = func[0];
+		m_pos = func.lower_bound;
-		m_base = func[0];
+		m_base = func.entry_point;
-		m_size = (func.size() - 1) * 4;
+		m_size = ::size32(func.data) * 4;
 		const u32 start = m_pos;
 		const u32 end = start + m_size;
@ -4279,16 +4266,16 @@ public:
 			// Disable check (unsafe)
 			m_ir->CreateBr(label_body);
 		}
-		else if (func.size() - 1 == 1)
+		else if (func.data.size() == 1)
 		{
 			const auto pu32 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type<u32*>());
-			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func[1]));
+			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func.data[0]));
 			m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 		}
-		else if (func.size() - 1 == 2)
+		else if (func.data.size() == 2)
 		{
 			const auto pu64 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type<u64*>());
-			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast<u64>(func[2]) << 32 | func[1]));
+			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast<u64>(func.data[1]) << 32 | func.data[0]));
 			m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 		}
 		else
@ -4298,7 +4285,7 @@ public:
 			// Skip holes at the beginning (giga only)
 			for (u32 j = start; j < end; j += 4)
 			{
-				if (!func[(j - start) / 4 + 1])
+				if (!func.data[(j - start) / 4])
 				{
 					starta += 4;
 				}
@ -4324,7 +4311,7 @@ public:
 				{
 					const u32 k = j + i * 4;
-					if (k < start || k >= end || !func[(k - start) / 4 + 1])
+					if (k < start || k >= end || !func.data[(k - start) / 4])
 					{
 						indices[i] = 8;
 						holes      = true;
@ -4357,7 +4344,7 @@ public:
 				for (u32 i = 0; i < 8; i++)
 				{
 					const u32 k = j + i * 4;
-					words[i] = k >= start && k < end ? func[(k - start) / 4 + 1] : 0;
+					words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
 				}
 				vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words));
@ -4598,7 +4585,7 @@ public:
 						break;
 					}
-					const u32 op = se_storage<u32>::swap(func[(m_pos - start) / 4 + 1]);
+					const u32 op = std::bit_cast<be_t<u32>>(func.data[(m_pos - start) / 4]);
 					if (!op)
 					{
@ -4744,7 +4731,7 @@ public:
 		if (g_cfg.core.spu_debug)
 		{
-			fmt::append(log, "LLVM IR at 0x%x:\n", func[0]);
+			fmt::append(log, "LLVM IR at 0x%x:\n", func.entry_point);
 			out << *module; // print IR
 			out << "\n\n";
 		}
@ -4752,7 +4739,7 @@ public:
 		if (verifyModule(*module, &out))
 		{
 			out.flush();
-			LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func[0], log);
+			LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func.entry_point, log);
 			if (g_cfg.core.spu_debug)
 			{
@ -4781,7 +4768,7 @@ public:
 		add_loc->compiled = fn;
 		// Rebuild trampoline if necessary
-		if (!m_spurt->rebuild_ubertrampoline(func[1]))
+		if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
 		{
 			return nullptr;
 		}
@ -8398,7 +8385,7 @@ struct spu_llvm
 			}
 			// Start compiling
-			const std::vector<u32>& func = found_it->second->data;
+			const spu_program& func = found_it->second->data;
 			// Old function pointer (pre-recompiled)
 			const spu_function_t _old = found_it->second->compiled;
@ -8407,21 +8394,21 @@ struct spu_llvm
 			enqueued.erase(found_it);
 			// Get data start
-			const u32 start = func[0];
+			const u32 start = func.lower_bound;
-			const u32 size0 = ::size32(func);
+			const u32 size0 = ::size32(func.data);
 			// Initialize LS with function data only
-			for (u32 i = 1, pos = start; i < size0; i++, pos += 4)
+			for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
 			{
-				ls[pos / 4] = se_storage<u32>::swap(func[i]);
+				ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
 			}
 			// Call analyser
-			std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
+			spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
 			if (func2 != func)
 			{
-				LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
+				LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
 			}
 			else if (const auto target = compiler->compile(std::move(func2)))
 			{
@ -8444,7 +8431,7 @@ struct spu_llvm
 			}
 			else
 			{
-				LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func[0]);
+				LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func.entry_point);
 				Emu.Pause();
 				return;
 			}
@ -8469,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
 		}
 	}
-	virtual spu_function_t compile(std::vector<u32>&& _func) override
+	virtual spu_function_t compile(spu_program&& _func) override
 	{
 		const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -8483,7 +8470,7 @@ struct spu_fast : public spu_recompiler_base
 			return add_loc->compiled;
 		}
-		const std::vector<u32>& func = add_loc->data;
+		const spu_program& func = add_loc->data;
 		if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
 		{
@ -8493,22 +8480,22 @@ struct spu_fast : public spu_recompiler_base
 		}
 		// Allocate executable area with necessary size
-		const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
+		const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16);
 		if (!result)
 		{
 			return nullptr;
 		}
-		m_pos = func[0];
+		m_pos = func.lower_bound;
-		m_size = (::size32(func) - 1) * 4;
+		m_size = ::size32(func.data) * 4;
 		{
 			sha1_context ctx;
 			u8 output[20];
 			sha1_starts(&ctx);
-			sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+			sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
 			sha1_finish(&ctx, output);
 			be_t<u64> hash_start;
@ -8554,9 +8541,9 @@ struct spu_fast : public spu_recompiler_base
 		*raw++ = 0x00;
 		// Verification (slow)
-		for (u32 i = 1; i < func.size(); i++)
+		for (u32 i = 0; i < func.data.size(); i++)
 		{
-			if (!func[i])
+			if (!func.data[i])
 			{
 				continue;
 			}
@ -8564,8 +8551,8 @@ struct spu_fast : public spu_recompiler_base
 			// cmp dword ptr [rcx + off], opc
 			*raw++ = 0x81;
 			*raw++ = 0xb9;
-			const u32 off = (i - 1) * 4;
+			const u32 off = i * 4;
-			const u32 opc = func[i];
+			const u32 opc = func.data[i];
 			std::memcpy(raw + 0, &off, 4);
 			std::memcpy(raw + 4, &opc, 4);
 			raw += 8;
@ -8627,16 +8614,16 @@ struct spu_fast : public spu_recompiler_base
 		*raw++ = 0x4c;
 		*raw++ = 0x8d;
 		*raw++ = 0x35;
-		const u32 epi_off = (::size32(func) - 1) * 16;
+		const u32 epi_off = ::size32(func.data) * 16;
 		std::memcpy(raw, &epi_off, 4);
 		raw += 4;
 		// Instructions (each instruction occupies fixed number of bytes)
-		for (u32 i = 1; i < func.size(); i++)
+		for (u32 i = 0; i < func.data.size(); i++)
 		{
-			const u32 pos = m_pos + (i - 1) * 4;
+			const u32 pos = m_pos + i * 4;
-			if (!func[i])
+			if (!func.data[i])
 			{
 				// Save pc: mov [rbp + spu_thread::pc], r12d
 				*raw++ = 0x44;
@ -8658,7 +8645,7 @@ struct spu_fast : public spu_recompiler_base
 			}
 			// Fix endianness
-			const spu_opcode_t op{se_storage<u32>::swap(func[i])};
+			const spu_opcode_t op{std::bit_cast<be_t<u32>>(func.data[i])};
 			switch (auto type = s_spu_itype.decode(op.opcode))
 			{
@ -8797,7 +8784,7 @@ struct spu_fast : public spu_recompiler_base
 		}
 		// Rebuild trampoline if necessary
-		if (!m_spurt->rebuild_ubertrampoline(func[1]))
+		if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
 		{
 			return nullptr;
 		}
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -29,18 +29,39 @@ public:
 		return m_file.operator bool();
 	}
-	std::deque<std::vector<u32>> get();
+	std::deque<struct spu_program> get();
-	void add(const std::vector<u32>& func);
+	void add(const struct spu_program& func);
 	static void initialize();
 };
 struct spu_program
 {
 	// Address of the entry point in LS
 	u32 entry_point;
 	// Address of the data in LS
 	u32 lower_bound;
 	// Program data with intentionally wrong endianness (on LE platform opcode values are swapped)
 	std::vector<u32> data;
 	bool operator==(const spu_program& rhs) const noexcept;
 	bool operator!=(const spu_program& rhs) const noexcept
 	{
 		return !(*this == rhs);
 	}
 	bool operator<(const spu_program& rhs) const noexcept;
 };
 class spu_item
 {
 public:
 	// SPU program
-	const std::vector<u32> data;
+	const spu_program data;
 	// Compiled function pointer
 	atomic_t<spu_function_t> compiled = nullptr;
@ -51,7 +72,7 @@ public:
 	atomic_t<u8> cached = false;
 	atomic_t<u8> logged = false;
-	spu_item(std::vector<u32>&& data)
+	spu_item(spu_program&& data)
 		: data(std::move(data))
 	{
 	}
@ -64,12 +85,6 @@ public:
 // Helper class
 class spu_runtime
 {
 	struct func_compare
 	{
 		// Comparison function for SPU programs
 		bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
 	};
 	// All functions (2^20 bunches)
 	std::array<lf_bunch<spu_item>, (1 << 20)> m_stuff;
@ -109,7 +124,7 @@ private:
 public:
 	// Return new pointer for add()
-	spu_item* add_empty(std::vector<u32>&&);
+	spu_item* add_empty(spu_program&&);
 	// Find existing function
 	spu_function_t find(const u32* ls, u32 addr) const;
@ -292,7 +307,7 @@ public:
 	virtual void init() = 0;
 	// Compile function
-	virtual spu_function_t compile(std::vector<u32>&&) = 0;
+	virtual spu_function_t compile(spu_program&&) = 0;
 	// Default dispatch function fallback (second arg is unused)
 	static void dispatch(spu_thread&, void*, u8* rip);
@ -304,10 +319,10 @@ public:
 	static void old_interpreter(spu_thread&, void* ls, u8*);
 	// Get the function data at specified address
-	std::vector<u32> analyse(const be_t<u32>* ls, u32 lsa);
+	spu_program analyse(const be_t<u32>* ls, u32 lsa);
 	// Print analyser internal state
-	void dump(const std::vector<u32>& result, std::string& out);
+	void dump(const spu_program& result, std::string& out);
 	// Get SPU Runtime
 	spu_runtime& get_runtime()