diff --git a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp index ccba75b985..f38e750609 100644 --- a/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp +++ b/rpcs3/Emu/RSX/Program/ProgramStateCache.cpp @@ -35,85 +35,85 @@ using namespace program_hash_util; -AVX512_ICL_FUNC usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program) -{ #ifdef ARCH_X64 - if (utils::has_avx512_icl()) +AVX512_ICL_FUNC usz get_vertex_program_ucode_hash_512(const RSXVertexProgram &program) +{ + // Load all elements of the instruction_mask bitset + const __m512i* instMask512 = reinterpret_cast(&program.instruction_mask); + const __m128i* instMask128 = reinterpret_cast(&program.instruction_mask); + + const __m512i lowerMask = _mm512_loadu_si512(instMask512); + const __m128i upper128 = _mm_loadu_si128(instMask128 + 4); + const __m512i upperMask = _mm512_zextsi128_si512(upper128); + + __m512i maskIndex = _mm512_setzero_si512(); + const __m512i negativeOnes = _mm512_set1_epi64(-1); + + // Special masks to test against bitset + const __m512i testMask0 = _mm512_set_epi64( + 0x0808080808080808, + 0x0808080808080808, + 0x0404040404040404, + 0x0404040404040404, + 0x0202020202020202, + 0x0202020202020202, + 0x0101010101010101, + 0x0101010101010101); + + const __m512i testMask1 = _mm512_set_epi64( + 0x8080808080808080, + 0x8080808080808080, + 0x4040404040404040, + 0x4040404040404040, + 0x2020202020202020, + 0x2020202020202020, + 0x1010101010101010, + 0x1010101010101010); + + const __m512i* instBuffer = reinterpret_cast(program.data.data()); + __m512i acc0 = _mm512_setzero_si512(); + __m512i acc1 = _mm512_setzero_si512(); + + __m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + __m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8); + __m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16); + + u32 instIndex = 0; + + // If there is remainder, add an extra (masked) iteration + u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0; + u32 length = (program.data.size() / 32) + extraIteration; + + // The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop + // for the remainder, or a scalar loop. + while (instIndex < (length)) { - // Load all elements of the instruction_mask bitset - const __m512i* instMask512 = reinterpret_cast(&program.instruction_mask); - const __m128i* instMask128 = reinterpret_cast(&program.instruction_mask); + const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask); + const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0); + const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1); + const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2)); + const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2)+ 1)); - const __m512i lowerMask = _mm512_loadu_si512(instMask512); - const __m128i upper128 = _mm_loadu_si128(instMask128 + 4); - const __m512i upperMask = _mm512_zextsi128_si512(upper128); - - __m512i maskIndex = _mm512_setzero_si512(); - const __m512i negativeOnes = _mm512_set1_epi64(-1); + const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0); + const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1); - // Special masks to test against bitset - const __m512i testMask0 = _mm512_set_epi64( - 0x0808080808080808, - 0x0808080808080808, - 0x0404040404040404, - 0x0404040404040404, - 0x0202020202020202, - 0x0202020202020202, - 0x0101010101010101, - 0x0101010101010101); + acc0 = _mm512_add_epi64(acc0, rotated0); + acc1 = _mm512_add_epi64(acc1, rotated1); - const __m512i testMask1 = _mm512_set_epi64( - 0x8080808080808080, - 0x8080808080808080, - 0x4040404040404040, - 0x4040404040404040, - 0x2020202020202020, - 0x2020202020202020, - 0x1010101010101010, - 0x1010101010101010); + rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd); + rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd); + maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes); - const __m512i* instBuffer = reinterpret_cast(program.data.data()); - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - - __m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); - __m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8); - __m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16); - - u32 instIndex = 0; - - // If there is remainder, add an extra (masked) iteration - u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0; - u32 length = (program.data.size() / 32) + extraIteration; - - // The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop - // for the remainder, or a scalar loop. - while (instIndex < (length)) - { - const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask); - const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0); - const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1); - const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2)); - const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2)+ 1)); - - const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0); - const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1); - - acc0 = _mm512_add_epi64(acc0, rotated0); - acc1 = _mm512_add_epi64(acc1, rotated1); - - rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd); - rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd); - maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes); - - instIndex++; - } - - const __m512i result = _mm512_add_epi64(acc0, acc1); - return _mm512_reduce_add_epi64(result); + instIndex++; } + + const __m512i result = _mm512_add_epi64(acc0, acc1); + return _mm512_reduce_add_epi64(result); +} #endif +usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program) +{ // Checksum as hash with rotated data const void* instbuffer = program.data.data(); u32 instIndex = 0; @@ -427,7 +427,20 @@ vertex_program_utils::vertex_program_metadata vertex_program_utils::analyse_vert usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) const { +#ifdef ARCH_X64 + usz ucode_hash; + + if (utils::has_avx512_icl()) + { + ucode_hash = get_vertex_program_ucode_hash_512(program); + } + else + { + ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program); + } +#else const usz ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program); +#endif const u32 state_params[] = { program.ctrl, @@ -439,21 +452,8 @@ usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) con return rpcs3::hash64(ucode_hash, metadata_hash); } -AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const -{ - if (binary1.output_mask != binary2.output_mask) - return false; - if (binary1.ctrl != binary2.ctrl) - return false; - if (binary1.texture_state != binary2.texture_state) - return false; - if (binary1.data.size() != binary2.data.size()) - return false; - if (binary1.jump_table != binary2.jump_table) - return false; - #ifdef ARCH_X64 - if (utils::has_avx512_icl()) +AVX512_ICL_FUNC bool vertex_program_compare_512(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) { // Load all elements of the instruction_mask bitset const __m512i* instMask512 = reinterpret_cast(&binary1.instruction_mask); @@ -530,6 +530,26 @@ AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram & } #endif +bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const +{ + if (binary1.output_mask != binary2.output_mask) + return false; + if (binary1.ctrl != binary2.ctrl) + return false; + if (binary1.texture_state != binary2.texture_state) + return false; + if (binary1.data.size() != binary2.data.size()) + return false; + if (binary1.jump_table != binary2.jump_table) + return false; + +#ifdef ARCH_X64 + if (utils::has_avx512_icl()) + { + return vertex_program_compare_512(binary1, binary2); + } +#endif + const void* instBuffer1 = binary1.data.data(); const void* instBuffer2 = binary2.data.data(); usz instIndex = 0;