RSX: Use AVX-512-ICL code in get_vertecx_program_ucode_hash and in vertex_program_compare

- Code is about 4 times as fast on my zen4 machine - Should be twice as fast on zen5 machines with full width AVX-512
2025-10-01 05:39:06 +00:00 · 2025-02-28 08:45:34 -05:00 · 2025-02-28 08:45:34 -05:00 · 8f87baab6c
commit 8f87baab6c
parent 4e08242074
1 changed files with 186 additions and 19 deletions
--- a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp
+++ b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp
@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "ProgramStateCache.h"
 #include "Emu/system_config.h"
+#include "util/sysinfo.hpp"

 #include <stack>

@ -21,31 +22,119 @@
 #endif
 #endif

+#ifdef ARCH_ARM64
+#define AVX512_ICL_FUNC
+#endif
+
+#ifdef _MSC_VER
+#define AVX512_ICL_FUNC
+#else
+#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
+#endif
+
+
 using namespace program_hash_util;

-usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
+AVX512_ICL_FUNC usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
 {
-	// Checksum as hash with rotated data
-	const void* instbuffer = program.data.data();
-	u32 instIndex = 0;
-	usz acc0 = 0;
-	usz acc1 = 0;
-
-	do
+#ifdef ARCH_X64
+	if (utils::has_avx512_icl())
 	{
-		if (program.instruction_mask[instIndex])
+		// Load all elements of the instruction_mask bitset
+		const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&program.instruction_mask);
+		const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&program.instruction_mask);
+
+		const __m512i lowerMask = _mm512_loadu_si512(instMask512);
+		const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
+		const __m512i upperMask = _mm512_zextsi128_si512(upper128);
+		
+		__m512i maskIndex = _mm512_setzero_si512();
+		const __m512i negativeOnes = _mm512_set1_epi64(-1);
+
+		// Special masks to test against bitset 
+		const __m512i testMask0 = _mm512_set_epi64(
+		0x0808080808080808,
+		0x0808080808080808,
+		0x0404040404040404,
+		0x0404040404040404,
+		0x0202020202020202,
+		0x0202020202020202,
+		0x0101010101010101,
+		0x0101010101010101);
+
+		const __m512i testMask1 = _mm512_set_epi64(
+		0x8080808080808080,
+		0x8080808080808080,
+		0x4040404040404040,
+		0x4040404040404040,
+		0x2020202020202020,
+		0x2020202020202020,
+		0x1010101010101010,
+		0x1010101010101010);
+
+		const __m512i* instBuffer = reinterpret_cast<const __m512i*>(program.data.data());
+		__m512i acc0 = _mm512_setzero_si512();
+		__m512i acc1 = _mm512_setzero_si512();
+
+		__m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+		__m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8);
+		__m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16);
+
+		u32 instIndex = 0;
+
+		// If there is remainder, add an extra (masked) iteration
+		u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0;
+		u32 length = (program.data.size() / 32) + extraIteration;
+
+		// The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop
+		// for the remainder, or a scalar loop.
+		while (instIndex < (length))
 		{
-			const auto inst = v128::loadu(instbuffer, instIndex);
-			usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
-			acc0 += tmp0;
-			usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
-			acc1 += tmp1;
+			const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
+			const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
+			const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
+			const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2));
+			const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2)+ 1));
+
+			const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0);
+			const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1);
+
+			acc0 = _mm512_add_epi64(acc0, rotated0);
+			acc1 = _mm512_add_epi64(acc1, rotated1);
+
+			rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd);
+			rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd);
+			maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
+
+			instIndex++;
 		}

-		instIndex++;
-	} while (instIndex < (program.data.size() / 4));
+		const __m512i result = _mm512_add_epi64(acc0, acc1);
+		return _mm512_reduce_add_epi64(result);
+	}
+#endif
+
+ 	// Checksum as hash with rotated data
+ 	const void* instbuffer = program.data.data();
+ 	u32 instIndex = 0;
+ 	usz acc0 = 0;
+ 	usz acc1 = 0;
+
+ 	do
+ 	{
+ 		if (program.instruction_mask[instIndex])
+ 		{
+ 			const auto inst = v128::loadu(instbuffer, instIndex);
+ 			usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
+ 			acc0 += tmp0;
+ 			usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
+ 			acc1 += tmp1;
+ 		}
+
+ 		instIndex++;
+ 	} while (instIndex < (program.data.size() / 4));
 	return acc0 + acc1;
-}
+ }

 vertex_program_utils::vertex_program_metadata vertex_program_utils::analyse_vertex_program(const u32* data, u32 entry, RSXVertexProgram& dst_prog)
 {
@ -350,7 +439,7 @@ usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) con
 	return rpcs3::hash64(ucode_hash, metadata_hash);
 }

-bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
+AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
 {
 	if (binary1.output_mask != binary2.output_mask)
 		return false;
@ -363,10 +452,88 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
 	if (binary1.jump_table != binary2.jump_table)
 		return false;

+#ifdef ARCH_X64
+	if (utils::has_avx512_icl())
+	{
+		// Load all elements of the instruction_mask bitset
+		const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&binary1.instruction_mask);
+		const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&binary1.instruction_mask);
+
+		const __m512i lowerMask = _mm512_loadu_si512(instMask512);
+		const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
+		const __m512i upperMask = _mm512_zextsi128_si512(upper128);
+		
+		__m512i maskIndex = _mm512_setzero_si512();
+		const __m512i negativeOnes = _mm512_set1_epi64(-1);
+
+		// Special masks to test against bitset 
+		const __m512i testMask0 = _mm512_set_epi64(
+		0x0808080808080808,
+		0x0808080808080808,
+		0x0404040404040404,
+		0x0404040404040404,
+		0x0202020202020202,
+		0x0202020202020202,
+		0x0101010101010101,
+		0x0101010101010101);
+
+		const __m512i testMask1 = _mm512_set_epi64(
+		0x8080808080808080,
+		0x8080808080808080,
+		0x4040404040404040,
+		0x4040404040404040,
+		0x2020202020202020,
+		0x2020202020202020,
+		0x1010101010101010,
+		0x1010101010101010);
+
+		const __m512i* instBuffer1 = reinterpret_cast<const __m512i*>(binary1.data.data());
+		const __m512i* instBuffer2 = reinterpret_cast<const __m512i*>(binary2.data.data());
+
+		// If there is remainder, add an extra (masked) iteration
+		u32 extraIteration = (binary1.data.size() % 32 != 0) ? 1 : 0;
+		u32 length = (binary1.data.size() / 32) + extraIteration;
+
+		u32 instIndex = 0;
+
+		// The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop
+		// for the remainder, or a scalar loop.
+		while (instIndex < (length))
+		{
+			const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
+
+			const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
+			const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
+
+			const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer1 + (instIndex * 2)));
+			const __m512i load1 = _mm512_maskz_loadu_epi64(result0, (instBuffer2 + (instIndex * 2)));
+			const __m512i load2 = _mm512_maskz_loadu_epi64(result1, (instBuffer1 + (instIndex * 2) + 1));
+			const __m512i load3 = _mm512_maskz_loadu_epi64(result1, (instBuffer2 + (instIndex * 2)+ 1));
+
+			const __mmask8 res0 = _mm512_cmpneq_epi64_mask(load0, load1);
+			const __mmask8 res1 = _mm512_cmpneq_epi64_mask(load2, load3);
+
+			const u8 result = _kortestz_mask8_u8(res0, res1);
+
+			//kortestz will set result to 1 if all bits are zero, so invert the check for result
+			if (!result)
+			{
+				return false;
+			}
+
+			maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
+
+			instIndex++;
+		}
+
+		return true;
+	}
+#endif
+
 	const void* instBuffer1 = binary1.data.data();
 	const void* instBuffer2 = binary2.data.data();
 	usz instIndex = 0;
-	for (unsigned i = 0; i < binary1.data.size() / 4; i++)
+	while (instIndex < (binary1.data.size() / 4))
 	{
 		if (binary1.instruction_mask[instIndex])
 		{