Merge branch 'master' into master

This commit is contained in:
Sanjay Govind 2025-02-02 16:40:08 +13:00 committed by GitHub
commit f7dd259ab4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 187 additions and 78 deletions

View file

@ -1177,6 +1177,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
func.size = 0x1C;
func.blocks.emplace(func.addr, func.size);
func.attr += ppu_attr::known_size;
known_functions.emplace(func.addr);
// Look for other imports to fill gaps (hack)
auto _p2 = _ptr + 7;
@ -1195,6 +1196,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
next.size = 0x1C;
next.blocks.emplace(next.addr, next.size);
next.attr += ppu_attr::known_size;
known_functions.emplace(_p2.addr());
advance(_p2, p2, 7);
}
@ -1213,9 +1215,8 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
// Trampoline with TOC
const u32 target = (ptr[3] << 16) + s16(ptr[4]);
const u32 toc_add = (ptr[1] << 16) + s16(ptr[2]);
constexpr u32 func_size = 0x1C;
if (target >= start && target < end && verify_ref((_ptr + 3).addr()) && target - func.addr >= func_size)
if (target >= start && target < end && verify_ref((_ptr + 3).addr()))
{
auto& new_func = add_func(target, 0, func.addr);
@ -1774,8 +1775,23 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
u32 per_instruction_bytes = 0;
for (auto&& [_, func] : fmap)
// Iterate by address (fmap may grow)
for (u32 addr_next = start; addr_next != end;)
{
// Get next iterator
const auto it = fmap.lower_bound(addr_next);
if (it == fmap.end())
{
break;
}
// Save the next function's address as it is at this moment (ignoring functions added later)
const auto it_next = std::next(it);
addr_next = it_next == fmap.end() ? end : it_next->first;
const ppu_function_ext& func = it->second;
if (func.attr & ppu_attr::no_size && entry)
{
// Disabled for PRX for now
@ -1793,6 +1809,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
}
per_instruction_bytes += utils::sub_saturate<u32>(lim, func.addr);
addr_next = std::max<u32>(addr_next, lim);
continue;
}
@ -1814,7 +1831,7 @@ bool ppu_module<lv2_obj>::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con
block.addr = addr;
block.size = size;
block.toc = func.toc;
ppu_log.trace("Block __0x%x added (func=0x%x, size=0x%x, toc=0x%x)", block.addr, _, block.size, block.toc);
ppu_log.trace("Block __0x%x added (func=0x%x, size=0x%x, toc=0x%x)", block.addr, it->first, block.size, block.toc);
if (!entry && !sec_end)
{

View file

@ -1652,7 +1652,7 @@ public:
u32 elements;
u32 dwords;
if (m_use_avx512 && g_cfg.core.full_width_avx512)
if (m_use_avx512)
{
stride = 64;
elements = 16;
@ -1677,94 +1677,175 @@ public:
llvm::Value* acc = nullptr;
for (u32 j = starta; j < end; j += stride)
// Use a 512-bit simple checksum to verify integrity if the size is at least 512b * 3
// This code uses a 512-bit vector for all hardware to ensure behavior matches.
// The checksum path is still faster even on narrow hardware.
if ((end - starta) >= 192 && !g_cfg.core.precise_spu_verification)
{
int indices[16];
bool holes = false;
bool data = false;
for (u32 i = 0; i < elements; i++)
for (u32 j = starta; j < end; j += 64)
{
const u32 k = j + i * 4;
int indices[16];
bool holes = false;
bool data = false;
if (k < start || k >= end || !func.data[(k - start) / 4])
for (u32 i = 0; i < 16; i++)
{
indices[i] = elements;
holes = true;
const u32 k = j + i * 4;
if (k < start || k >= end || !func.data[(k - start) / 4])
{
indices[i] = 16;
holes = true;
}
else
{
indices[i] = i;
data = true;
}
}
else
if (!data)
{
indices[i] = i;
data = true;
// Skip full-sized holes
continue;
}
}
if (!data)
{
// Skip full-sized holes
continue;
}
llvm::Value* vls = nullptr;
llvm::Value* vls = nullptr;
// Load unaligned code block from LS
if (m_use_avx512 && g_cfg.core.full_width_avx512)
{
// Load unaligned code block from LS
vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, 16));
}
acc = acc ? m_ir->CreateAdd(acc, vls) : vls;
check_iterations++;
}
else if (m_use_avx)
// Create the checksum
u32 checksum[16] = {0};
for (u32 j = 0; j < func.data.size(); j += 16) // Process 16 elements per iteration
{
vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
}
else
{
vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
for (u32 i = 0; i < 16; i++)
{
if (j + i < func.data.size())
{
checksum[i] += func.data[j + i];
}
}
}
// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
}
auto* const_vector = ConstantDataVector::get(m_context, llvm::ArrayRef(checksum, 16));
acc = m_ir->CreateXor(acc, const_vector);
// Perform bitwise comparison and accumulate
u32 words[16];
for (u32 i = 0; i < elements; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
}
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
check_iterations++;
}
// Pattern for PTEST
if (m_use_avx512 && g_cfg.core.full_width_avx512)
{
// Pattern for PTEST
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
}
else if (m_use_avx)
{
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
for (u32 i = 1; i < 8; i++)
{
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
}
// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
else
{
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
for (u32 j = starta; j < end; j += stride)
{
int indices[16];
bool holes = false;
bool data = false;
for (u32 i = 0; i < elements; i++)
{
const u32 k = j + i * 4;
if (k < start || k >= end || !func.data[(k - start) / 4])
{
indices[i] = elements;
holes = true;
}
else
{
indices[i] = i;
data = true;
}
}
if (!data)
{
// Skip full-sized holes
continue;
}
llvm::Value* vls = nullptr;
// Load unaligned code block from LS
if (m_use_avx512)
{
vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
}
else if (m_use_avx)
{
vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
}
else
{
vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
}
// Mask if necessary
if (holes)
{
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
}
// Perform bitwise comparison and accumulate
u32 words[16];
for (u32 i = 0; i < elements; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
}
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
check_iterations++;
}
// Pattern for PTEST
if (m_use_avx512)
{
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
}
else if (m_use_avx)
{
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
}
else
{
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
}
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
for (u32 i = 1; i < dwords; i++)
{
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
}
// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
for (u32 i = 1; i < dwords; i++)
{
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
}
// Compare result with zero
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
// Increase block counter with statistics

View file

@ -147,7 +147,7 @@ u64 convert_to_timebased_time(u64 time)
u64 get_timebased_time()
{
if (0) if (u64 freq = utils::get_tsc_freq())
if (u64 freq = utils::get_tsc_freq())
{
const u64 tsc = utils::get_tsc();
@ -207,7 +207,7 @@ void initialize_timebased_time(u64 timebased_init, bool reset)
// Returns some relative time in microseconds, don't change this fact
u64 get_system_time()
{
if (0) if (u64 freq = utils::get_tsc_freq())
if (u64 freq = utils::get_tsc_freq())
{
const u64 tsc = utils::get_tsc();

View file

@ -68,7 +68,7 @@ struct cfg_root : cfg::node
cfg::_enum<xfloat_accuracy> spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false };
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", true };
cfg::_bool precise_spu_verification{ this, "Precise SPU Verification", false }; // Disables use of xorsum based spu verification if enabled.
cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM
cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.

View file

@ -524,7 +524,7 @@ std::string utils::get_system_info()
}
else
{
fmt::append(result, " | TSC: Bad");
fmt::append(result, " | TSC: Disabled");
}
if (has_avx())
@ -772,15 +772,26 @@ static const bool s_tsc_freq_evaluated = []() -> bool
#endif
if (!utils::has_invariant_tsc())
{
return 0;
}
if (utils::get_cpu_brand().find("Ryzen") != umax)
{
return 0;
}
#ifdef _WIN32
LARGE_INTEGER freq;
if (!QueryPerformanceFrequency(&freq))
{
return 0;
}
if (freq.QuadPart <= 9'999'999)
{
return 0;
}
const ullong timer_freq = freq.QuadPart;
#else
@ -880,7 +891,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool
return round_tsc(res, utils::mul_saturate<u64>(utils::add_saturate<u64>(rdtsc_diff[0], rdtsc_diff[1]), utils::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
}();
atomic_storage<u64>::release(utils::s_tsc_freq, cal_tsc);
atomic_storage<u64>::store(utils::s_tsc_freq, cal_tsc);
return true;
}();