SPU DMA: more tuning for mov_rdata_avx

Avoid unaligned stores.
Prefer asm path if __AVX2__ is not set.
Don't emit vzeroupper if __AVX__ is set.
Author: Nekotekina
Date: 2020-04-27 17:46:15 +03:00
Parent: 4f71c570bd
Commit: 3ec73b651e
2 changed files with 45 additions and 8 deletions
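The "avoid unaligned stores" part works by peeling one 16-byte chunk off the start of a transfer whenever the destination is only 16-byte aligned, so that every following 256-bit store in mov_rdata_avx lands on a 32-byte boundary. A minimal, self-contained sketch of that pattern (hypothetical helper name, plain memcpy standing in for the v128 copy, assumes dst/src are at least 16-byte aligned, size is a multiple of 16, and the file is compiled with AVX enabled; not code from this commit):

// Sketch of the alignment-peel pattern used in do_dma_transfer below.
#include <immintrin.h>
#include <cstdint>
#include <cstring>

static void copy_with_aligned_avx_stores(std::uint8_t* dst, const std::uint8_t* src, std::size_t size)
{
	if (reinterpret_cast<std::uintptr_t>(dst) & 0x10)
	{
		// Peel one 16-byte chunk so dst becomes 32-byte aligned.
		std::memcpy(dst, src, 16);
		dst += 16;
		src += 16;
		size -= 16;
	}

	while (size >= 32)
	{
		// The aligned 256-bit store is now safe; the source may still be
		// misaligned, so keep using the unaligned load.
		_mm256_store_si256(reinterpret_cast<__m256i*>(dst),
			_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
		dst += 32;
		src += 32;
		size -= 32;
	}

	if (size)
	{
		std::memcpy(dst, src, size); // at most one 16-byte tail remains
	}
}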


@@ -43,11 +43,12 @@ static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const
static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
{
-#if defined(_MSC_VER) || defined(__AVX__)
-_mm256_storeu_si256(dst + 0, _mm256_loadu_si256(src + 0));
-_mm256_storeu_si256(dst + 1, _mm256_loadu_si256(src + 1));
-_mm256_storeu_si256(dst + 2, _mm256_loadu_si256(src + 2));
-_mm256_storeu_si256(dst + 3, _mm256_loadu_si256(src + 3));
+#if defined(_MSC_VER) || defined(__AVX2__)
+// In AVX-only mode, for some older CPU models, GCC/Clang may emit 128-bit loads/stores instead.
+_mm256_store_si256(dst + 0, _mm256_loadu_si256(src + 0));
+_mm256_store_si256(dst + 1, _mm256_loadu_si256(src + 1));
+_mm256_store_si256(dst + 2, _mm256_loadu_si256(src + 2));
+_mm256_store_si256(dst + 3, _mm256_loadu_si256(src + 3));
#else
__asm__(
"vmovdqu 0*32(%[src]), %%ymm0;" // load
@@ -58,11 +59,17 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
"vmovdqu %%ymm0, 2*32(%[dst]);"
"vmovdqu 3*32(%[src]), %%ymm0;"
"vmovdqu %%ymm0, 3*32(%[dst]);"
"vzeroupper"
#ifndef __AVX__
"vzeroupper" // Don't need in AVX mode (should be emitted automatically)
#endif
:
: [src] "r" (src)
, [dst] "r" (dst)
: "xmm0"
#ifdef __AVX__
: "ymm0" // Clobber ymm0 register (acknowledge its modification)
#else
: "xmm0" // ymm0 is "unknown" if not compiled in AVX mode, so clobber xmm0 only
#endif
);
#endif
}
@@ -1512,6 +1519,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
// Split locking + transfer in two parts (before 64K border, and after it)
const auto lock = vm::range_lock(range_addr, nexta);
+// Avoid unaligned stores in mov_rdata_avx
+if (reinterpret_cast<u64>(dst) & 0x10)
+{
+*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+dst += 16;
+src += 16;
+size0 -= 16;
+}
while (size0 >= 128)
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1536,6 +1553,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
const auto lock = vm::range_lock(range_addr, range_end);
+// Avoid unaligned stores in mov_rdata_avx
+if (reinterpret_cast<u64>(dst) & 0x10)
+{
+*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+dst += 16;
+src += 16;
+size -= 16;
+}
while (size >= 128)
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1586,6 +1613,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
default:
{
+// Avoid unaligned stores in mov_rdata_avx
+if (reinterpret_cast<u64>(dst) & 0x10)
+{
+*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+dst += 16;
+src += 16;
+size -= 16;
+}
while (size >= 128)
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));


@@ -556,7 +556,7 @@ public:
// Reservation Data
u64 rtime = 0;
-std::array<v128, 8> rdata{};
+alignas(64) std::array<v128, 8> rdata{};
u32 raddr = 0;
u32 srr0;
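
The header change above gives rdata cache-line alignment, which also satisfies the 32-byte alignment that the aligned _mm256_store_si256 path of mov_rdata_avx relies on when rdata is the destination. A small compile-time sanity check one could write for that assumption, using stand-in types rather than the real spu_thread/v128 definitions (sketch only, not code from this commit):

#include <array>
#include <cstddef>
#include <cstdint>

struct alignas(16) v128_sketch { std::uint8_t bytes[16]; }; // stand-in for RPCS3's v128

struct spu_thread_sketch // stand-in for the relevant members of spu_thread
{
	std::uint64_t rtime = 0;
	alignas(64) std::array<v128_sketch, 8> rdata{};
};

// alignas(64) pins rdata to a cache-line boundary, covering the 32-byte
// requirement of the aligned 256-bit stores.
static_assert(alignof(spu_thread_sketch) >= 32, "object alignment covers 32-byte AVX stores");
static_assert(offsetof(spu_thread_sketch, rdata) % 32 == 0, "rdata starts on a 32-byte boundary");
static_assert(sizeof(spu_thread_sketch::rdata) == 128, "rdata covers the 128-byte reservation data");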