diff --git a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp
index ae7edebe48..ccba75b985 100644
--- a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp
+++ b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp
@@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "ProgramStateCache.h"
 #include "Emu/system_config.h"
+#include "util/sysinfo.hpp"
 
 #include <stack>
 
@@ -21,31 +22,119 @@
 #endif
 #endif
 
+#if defined(ARCH_ARM64) || defined(_MSC_VER)
+#define AVX512_ICL_FUNC
+#else
+#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
+#endif
+
 using namespace program_hash_util;
 
-usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
+AVX512_ICL_FUNC usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
 {
-	// Checksum as hash with rotated data
-	const void* instbuffer = program.data.data();
-	u32 instIndex = 0;
-	usz acc0 = 0;
-	usz acc1 = 0;
-
-	do
+#ifdef ARCH_X64
+	if (utils::has_avx512_icl())
 	{
-		if (program.instruction_mask[instIndex])
+		// Load all elements of the instruction_mask bitset
+		const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&program.instruction_mask);
+		const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&program.instruction_mask);
+
+		const __m512i lowerMask = _mm512_loadu_si512(instMask512);
+		const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
+		const __m512i upperMask = _mm512_zextsi128_si512(upper128);
+
+		__m512i maskIndex = _mm512_setzero_si512();
+		const __m512i negativeOnes = _mm512_set1_epi64(-1);
+
+		// Special masks to test against the bitset
+		const __m512i testMask0 = _mm512_set_epi64(
+			0x0808080808080808,
+			0x0808080808080808,
+			0x0404040404040404,
+			0x0404040404040404,
+			0x0202020202020202,
+			0x0202020202020202,
+			0x0101010101010101,
+			0x0101010101010101);
+
+		const __m512i testMask1 = _mm512_set_epi64(
+			0x8080808080808080,
+			0x8080808080808080,
+			0x4040404040404040,
+			0x4040404040404040,
+			0x2020202020202020,
+			0x2020202020202020,
+			0x1010101010101010,
+			0x1010101010101010);
+
+		const __m512i* instBuffer = reinterpret_cast<const __m512i*>(program.data.data());
+		__m512i acc0 = _mm512_setzero_si512();
+		__m512i acc1 = _mm512_setzero_si512();
+
+		__m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+		__m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8);
+		__m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16);
+
+		u32 instIndex = 0;
+
+		// If there is a remainder, add an extra (masked) iteration
+		u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0;
+		u32 length = (program.data.size() / 32) + extraIteration;
+
+		// The instruction mask will prevent us from reading out of bounds, so we do not need a separate
+		// masked loop for the remainder, or a scalar loop.
+		while (instIndex < length)
 		{
-			const auto inst = v128::loadu(instbuffer, instIndex);
-			usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
-			acc0 += tmp0;
-			usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
-			acc1 += tmp1;
+			const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
+			const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
+			const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
+			const __m512i load0 = _mm512_maskz_loadu_epi64(result0, instBuffer + (instIndex * 2));
+			const __m512i load1 = _mm512_maskz_loadu_epi64(result1, instBuffer + (instIndex * 2) + 1);
+
+			const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0);
+			const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1);
+
+			acc0 = _mm512_add_epi64(acc0, rotated0);
+			acc1 = _mm512_add_epi64(acc1, rotated1);
+
+			rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd);
+			rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd);
+			maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
+
+			instIndex++;
 		}
 
-		instIndex++;
-	} while (instIndex < (program.data.size() / 4));
+		const __m512i result = _mm512_add_epi64(acc0, acc1);
+		return _mm512_reduce_add_epi64(result);
+	}
+#endif
+
+	// Checksum as hash with rotated data
+	const void* instbuffer = program.data.data();
+	u32 instIndex = 0;
+	usz acc0 = 0;
+	usz acc1 = 0;
+
+	do
+	{
+		if (program.instruction_mask[instIndex])
+		{
+			const auto inst = v128::loadu(instbuffer, instIndex);
+			usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
+			acc0 += tmp0;
+			usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
+			acc1 += tmp1;
+		}
+
+		instIndex++;
	} while (instIndex < (program.data.size() / 4));
 
 	return acc0 + acc1;
 }
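For reference, the mask-extraction step above can be modelled in scalar code. The sketch below is not part of the patch and the helper name is assumed; it shows how one byte of instruction_mask (which _mm512_permutex2var_epi8 broadcasts to every byte lane) becomes the two 8-bit load masks produced by _mm512_test_epi64_mask against testMask0 and testMask1: bit i of the byte enables both u64 lanes of instruction i, and lanes left disabled are zero-filled by _mm512_maskz_loadu_epi64, so skipped instructions contribute nothing to the accumulators, matching the scalar instruction_mask check.

#include <cstdint>

// Illustrative helper (assumed name): expands one instruction_mask byte
// (8 instructions, 1 bit each) into the per-u64-lane masks used by the masked loads.
static void expand_mask_byte(std::uint8_t mask_byte, std::uint8_t& load_mask0, std::uint8_t& load_mask1)
{
	load_mask0 = 0; // lanes for instructions 0..3 (first 64-byte load)
	load_mask1 = 0; // lanes for instructions 4..7 (second 64-byte load)

	for (int inst = 0; inst < 4; inst++)
	{
		if (mask_byte & (1u << inst))
			load_mask0 |= 0b11u << (inst * 2); // each instruction spans two u64 lanes
		if (mask_byte & (1u << (inst + 4)))
			load_mask1 |= 0b11u << (inst * 2);
	}
}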
@@ -350,7 +439,7 @@ usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) con
 	return rpcs3::hash64(ucode_hash, metadata_hash);
 }
 
-bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
+AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
 {
 	if (binary1.output_mask != binary2.output_mask)
 		return false;
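AVX512_ICL_FUNC only changes code generation: it allows the compiler to emit Ice-Lake-level AVX-512 instructions for the annotated function (MSVC needs no attribute), while the utils::has_avx512_icl() test keeps that path from ever executing on CPUs without those extensions. Below is a minimal self-contained sketch of the same compile-for-target / check-at-runtime split, with assumed names and __builtin_cpu_supports standing in for utils::has_avx512_icl():

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

#if defined(__GNUC__) || defined(__clang__)
#define SKETCH_AVX512_TARGET __attribute__((__target__("avx512f")))
#else
#define SKETCH_AVX512_TARGET
#endif

// Example only: the attribute permits AVX-512 codegen for this function,
// but the intrinsics run only after the runtime feature test succeeds.
SKETCH_AVX512_TARGET
std::uint64_t sum_u64(const std::uint64_t* data, std::size_t n)
{
#if defined(__GNUC__) || defined(__clang__)
	if (__builtin_cpu_supports("avx512f"))
	{
		__m512i acc = _mm512_setzero_si512();
		std::size_t i = 0;
		for (; i + 8 <= n; i += 8)
			acc = _mm512_add_epi64(acc, _mm512_loadu_si512(data + i));

		std::uint64_t total = _mm512_reduce_add_epi64(acc);
		for (; i < n; i++)
			total += data[i]; // scalar tail
		return total;
	}
#endif
	std::uint64_t total = 0;
	for (std::size_t i = 0; i < n; i++)
		total += data[i]; // portable fallback
	return total;
}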
@@ -363,10 +452,88 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
 	if (binary1.jump_table != binary2.jump_table)
 		return false;
 
+#ifdef ARCH_X64
+	if (utils::has_avx512_icl())
+	{
+		// Load all elements of the instruction_mask bitset
+		const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&binary1.instruction_mask);
+		const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&binary1.instruction_mask);
+
+		const __m512i lowerMask = _mm512_loadu_si512(instMask512);
+		const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
+		const __m512i upperMask = _mm512_zextsi128_si512(upper128);
+
+		__m512i maskIndex = _mm512_setzero_si512();
+		const __m512i negativeOnes = _mm512_set1_epi64(-1);
+
+		// Special masks to test against the bitset
+		const __m512i testMask0 = _mm512_set_epi64(
+			0x0808080808080808,
+			0x0808080808080808,
+			0x0404040404040404,
+			0x0404040404040404,
+			0x0202020202020202,
+			0x0202020202020202,
+			0x0101010101010101,
+			0x0101010101010101);
+
+		const __m512i testMask1 = _mm512_set_epi64(
+			0x8080808080808080,
+			0x8080808080808080,
+			0x4040404040404040,
+			0x4040404040404040,
+			0x2020202020202020,
+			0x2020202020202020,
+			0x1010101010101010,
+			0x1010101010101010);
+
+		const __m512i* instBuffer1 = reinterpret_cast<const __m512i*>(binary1.data.data());
+		const __m512i* instBuffer2 = reinterpret_cast<const __m512i*>(binary2.data.data());
+
+		// If there is a remainder, add an extra (masked) iteration
+		u32 extraIteration = (binary1.data.size() % 32 != 0) ? 1 : 0;
+		u32 length = (binary1.data.size() / 32) + extraIteration;
+
+		u32 instIndex = 0;
+
+		// The instruction mask will prevent us from reading out of bounds, so we do not need a separate
+		// masked loop for the remainder, or a scalar loop.
+		while (instIndex < length)
+		{
+			const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
+
+			const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
+			const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
+
+			const __m512i load0 = _mm512_maskz_loadu_epi64(result0, instBuffer1 + (instIndex * 2));
+			const __m512i load1 = _mm512_maskz_loadu_epi64(result0, instBuffer2 + (instIndex * 2));
+			const __m512i load2 = _mm512_maskz_loadu_epi64(result1, instBuffer1 + (instIndex * 2) + 1);
+			const __m512i load3 = _mm512_maskz_loadu_epi64(result1, instBuffer2 + (instIndex * 2) + 1);
+
+			const __mmask8 res0 = _mm512_cmpneq_epi64_mask(load0, load1);
+			const __mmask8 res1 = _mm512_cmpneq_epi64_mask(load2, load3);
+
+			const u8 result = _kortestz_mask8_u8(res0, res1);
+
+			// kortestz sets result to 1 only if all bits of (res0 | res1) are zero, so invert the check
+			if (!result)
+			{
+				return false;
+			}
+
+			maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
+
+			instIndex++;
+		}
+
+		return true;
+	}
+#endif
+
 	const void* instBuffer1 = binary1.data.data();
 	const void* instBuffer2 = binary2.data.data();
 	usz instIndex = 0;
-	for (unsigned i = 0; i < binary1.data.size() / 4; i++)
+	while (instIndex < (binary1.data.size() / 4))
 	{
 		if (binary1.instruction_mask[instIndex])
 		{
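The early return in the AVX-512 comparison relies on two facts: lanes disabled by the load mask are zero-filled in both inputs, so they can never compare unequal, and _kortestz_mask8_u8 returns 1 only when the OR of its two operands is all zero. A standalone sketch of that idiom (assumed function name, GCC/Clang target attribute used for brevity, not part of the patch):

#include <immintrin.h>
#include <cstdint>

// Example only: compare up to 8 u64 lanes of two buffers, honouring a lane mask.
__attribute__((__target__("avx512f,avx512dq")))
bool masked_lanes_equal(const std::uint64_t* a, const std::uint64_t* b, __mmask8 lane_mask)
{
	// Disabled lanes load as zero from both buffers and therefore always match.
	const __m512i va = _mm512_maskz_loadu_epi64(lane_mask, a);
	const __m512i vb = _mm512_maskz_loadu_epi64(lane_mask, b);

	const __mmask8 neq = _mm512_cmpneq_epi64_mask(va, vb);

	// _kortestz_mask8_u8 ORs its operands and returns 1 iff the result is all zero.
	return _kortestz_mask8_u8(neq, neq) != 0;
}

In the patch, a single kortest covers sixteen lanes per iteration by feeding it the two independent inequality masks res0 and res1.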