mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-20 03:25:16 +00:00
RSX: Fix issue where linux builds could hit illegal instruction on machines without AVX-512
- Place avx-512 function attributes in their own functions
This commit is contained in:
parent
783079266e
commit
1920aee415
1 changed files with 104 additions and 84 deletions
|
@ -35,85 +35,85 @@
|
|||
|
||||
using namespace program_hash_util;
|
||||
|
||||
AVX512_ICL_FUNC usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
|
||||
{
|
||||
#ifdef ARCH_X64
|
||||
if (utils::has_avx512_icl())
|
||||
AVX512_ICL_FUNC usz get_vertex_program_ucode_hash_512(const RSXVertexProgram &program)
|
||||
{
|
||||
// Load all elements of the instruction_mask bitset
|
||||
const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&program.instruction_mask);
|
||||
const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&program.instruction_mask);
|
||||
|
||||
const __m512i lowerMask = _mm512_loadu_si512(instMask512);
|
||||
const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
|
||||
const __m512i upperMask = _mm512_zextsi128_si512(upper128);
|
||||
|
||||
__m512i maskIndex = _mm512_setzero_si512();
|
||||
const __m512i negativeOnes = _mm512_set1_epi64(-1);
|
||||
|
||||
// Special masks to test against bitset
|
||||
const __m512i testMask0 = _mm512_set_epi64(
|
||||
0x0808080808080808,
|
||||
0x0808080808080808,
|
||||
0x0404040404040404,
|
||||
0x0404040404040404,
|
||||
0x0202020202020202,
|
||||
0x0202020202020202,
|
||||
0x0101010101010101,
|
||||
0x0101010101010101);
|
||||
|
||||
const __m512i testMask1 = _mm512_set_epi64(
|
||||
0x8080808080808080,
|
||||
0x8080808080808080,
|
||||
0x4040404040404040,
|
||||
0x4040404040404040,
|
||||
0x2020202020202020,
|
||||
0x2020202020202020,
|
||||
0x1010101010101010,
|
||||
0x1010101010101010);
|
||||
|
||||
const __m512i* instBuffer = reinterpret_cast<const __m512i*>(program.data.data());
|
||||
__m512i acc0 = _mm512_setzero_si512();
|
||||
__m512i acc1 = _mm512_setzero_si512();
|
||||
|
||||
__m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
__m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8);
|
||||
__m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16);
|
||||
|
||||
u32 instIndex = 0;
|
||||
|
||||
// If there is remainder, add an extra (masked) iteration
|
||||
u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0;
|
||||
u32 length = (program.data.size() / 32) + extraIteration;
|
||||
|
||||
// The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop
|
||||
// for the remainder, or a scalar loop.
|
||||
while (instIndex < (length))
|
||||
{
|
||||
// Load all elements of the instruction_mask bitset
|
||||
const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&program.instruction_mask);
|
||||
const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&program.instruction_mask);
|
||||
const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
|
||||
const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
|
||||
const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
|
||||
const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2));
|
||||
const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2)+ 1));
|
||||
|
||||
const __m512i lowerMask = _mm512_loadu_si512(instMask512);
|
||||
const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
|
||||
const __m512i upperMask = _mm512_zextsi128_si512(upper128);
|
||||
|
||||
__m512i maskIndex = _mm512_setzero_si512();
|
||||
const __m512i negativeOnes = _mm512_set1_epi64(-1);
|
||||
const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0);
|
||||
const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1);
|
||||
|
||||
// Special masks to test against bitset
|
||||
const __m512i testMask0 = _mm512_set_epi64(
|
||||
0x0808080808080808,
|
||||
0x0808080808080808,
|
||||
0x0404040404040404,
|
||||
0x0404040404040404,
|
||||
0x0202020202020202,
|
||||
0x0202020202020202,
|
||||
0x0101010101010101,
|
||||
0x0101010101010101);
|
||||
acc0 = _mm512_add_epi64(acc0, rotated0);
|
||||
acc1 = _mm512_add_epi64(acc1, rotated1);
|
||||
|
||||
const __m512i testMask1 = _mm512_set_epi64(
|
||||
0x8080808080808080,
|
||||
0x8080808080808080,
|
||||
0x4040404040404040,
|
||||
0x4040404040404040,
|
||||
0x2020202020202020,
|
||||
0x2020202020202020,
|
||||
0x1010101010101010,
|
||||
0x1010101010101010);
|
||||
rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd);
|
||||
rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd);
|
||||
maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
|
||||
|
||||
const __m512i* instBuffer = reinterpret_cast<const __m512i*>(program.data.data());
|
||||
__m512i acc0 = _mm512_setzero_si512();
|
||||
__m512i acc1 = _mm512_setzero_si512();
|
||||
|
||||
__m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
__m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8);
|
||||
__m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16);
|
||||
|
||||
u32 instIndex = 0;
|
||||
|
||||
// If there is remainder, add an extra (masked) iteration
|
||||
u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0;
|
||||
u32 length = (program.data.size() / 32) + extraIteration;
|
||||
|
||||
// The instruction mask will prevent us from reading out of bounds, we do not need a seperate masked loop
|
||||
// for the remainder, or a scalar loop.
|
||||
while (instIndex < (length))
|
||||
{
|
||||
const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
|
||||
const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
|
||||
const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
|
||||
const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2));
|
||||
const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2)+ 1));
|
||||
|
||||
const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0);
|
||||
const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1);
|
||||
|
||||
acc0 = _mm512_add_epi64(acc0, rotated0);
|
||||
acc1 = _mm512_add_epi64(acc1, rotated1);
|
||||
|
||||
rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd);
|
||||
rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd);
|
||||
maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
|
||||
|
||||
instIndex++;
|
||||
}
|
||||
|
||||
const __m512i result = _mm512_add_epi64(acc0, acc1);
|
||||
return _mm512_reduce_add_epi64(result);
|
||||
instIndex++;
|
||||
}
|
||||
|
||||
const __m512i result = _mm512_add_epi64(acc0, acc1);
|
||||
return _mm512_reduce_add_epi64(result);
|
||||
}
|
||||
#endif
|
||||
|
||||
usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram &program)
|
||||
{
|
||||
// Checksum as hash with rotated data
|
||||
const void* instbuffer = program.data.data();
|
||||
u32 instIndex = 0;
|
||||
|
@ -427,7 +427,20 @@ vertex_program_utils::vertex_program_metadata vertex_program_utils::analyse_vert
|
|||
|
||||
usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) const
|
||||
{
|
||||
#ifdef ARCH_X64
|
||||
usz ucode_hash;
|
||||
|
||||
if (utils::has_avx512_icl())
|
||||
{
|
||||
ucode_hash = get_vertex_program_ucode_hash_512(program);
|
||||
}
|
||||
else
|
||||
{
|
||||
ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program);
|
||||
}
|
||||
#else
|
||||
const usz ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program);
|
||||
#endif
|
||||
const u32 state_params[] =
|
||||
{
|
||||
program.ctrl,
|
||||
|
@ -439,21 +452,8 @@ usz vertex_program_storage_hash::operator()(const RSXVertexProgram &program) con
|
|||
return rpcs3::hash64(ucode_hash, metadata_hash);
|
||||
}
|
||||
|
||||
AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
|
||||
{
|
||||
if (binary1.output_mask != binary2.output_mask)
|
||||
return false;
|
||||
if (binary1.ctrl != binary2.ctrl)
|
||||
return false;
|
||||
if (binary1.texture_state != binary2.texture_state)
|
||||
return false;
|
||||
if (binary1.data.size() != binary2.data.size())
|
||||
return false;
|
||||
if (binary1.jump_table != binary2.jump_table)
|
||||
return false;
|
||||
|
||||
#ifdef ARCH_X64
|
||||
if (utils::has_avx512_icl())
|
||||
AVX512_ICL_FUNC bool vertex_program_compare_512(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2)
|
||||
{
|
||||
// Load all elements of the instruction_mask bitset
|
||||
const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&binary1.instruction_mask);
|
||||
|
@ -530,6 +530,26 @@ AVX512_ICL_FUNC bool vertex_program_compare::operator()(const RSXVertexProgram &
|
|||
}
|
||||
#endif
|
||||
|
||||
bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const RSXVertexProgram &binary2) const
|
||||
{
|
||||
if (binary1.output_mask != binary2.output_mask)
|
||||
return false;
|
||||
if (binary1.ctrl != binary2.ctrl)
|
||||
return false;
|
||||
if (binary1.texture_state != binary2.texture_state)
|
||||
return false;
|
||||
if (binary1.data.size() != binary2.data.size())
|
||||
return false;
|
||||
if (binary1.jump_table != binary2.jump_table)
|
||||
return false;
|
||||
|
||||
#ifdef ARCH_X64
|
||||
if (utils::has_avx512_icl())
|
||||
{
|
||||
return vertex_program_compare_512(binary1, binary2);
|
||||
}
|
||||
#endif
|
||||
|
||||
const void* instBuffer1 = binary1.data.data();
|
||||
const void* instBuffer2 = binary2.data.data();
|
||||
usz instIndex = 0;
|
||||
|
|
Loading…
Add table
Reference in a new issue