From c59cb1bdd3e4e55f15534739c072019a5b2cfd35 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Fri, 13 Sep 2019 02:53:45 +0300 Subject: [PATCH] rsx: Allow only sse4.1 capable CPUs to take the accelerated index path - Older sets lack the required min/max functionality --- rpcs3/Emu/RSX/Common/BufferUtils.cpp | 135 +++++++++------------------ 1 file changed, 43 insertions(+), 92 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 99c912f4ec..bda8616edc 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -627,59 +627,36 @@ namespace _mm_storeu_si128(dst_stream++, value); } - if (s_use_sse4_1) - { - const __m128i mask_step1 = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); + const __m128i mask_step1 = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); - const __m128i mask_step2 = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4); + const __m128i mask_step2 = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4); - const __m128i mask_step3 = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0x3, 0x2); + const __m128i mask_step3 = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0x3, 0x2); - __m128i tmp = __mm_shuffle_epi8(min, mask_step1); - min = __mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step2); - min = __mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step3); - min = __mm_min_epu16(min, tmp); + __m128i tmp = __mm_shuffle_epi8(min, mask_step1); + min = __mm_min_epu16(min, tmp); + tmp = __mm_shuffle_epi8(min, mask_step2); + min = __mm_min_epu16(min, tmp); + tmp = __mm_shuffle_epi8(min, mask_step3); + min = __mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(max, mask_step1); - max = __mm_max_epu16(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step2); - max = __mm_max_epu16(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step3); - max = __mm_max_epu16(max, tmp); + tmp = __mm_shuffle_epi8(max, mask_step1); + max = __mm_max_epu16(max, tmp); + tmp = __mm_shuffle_epi8(max, mask_step2); + max = __mm_max_epu16(max, tmp); + tmp = __mm_shuffle_epi8(max, mask_step3); + max = __mm_max_epu16(max, tmp); - const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF); - const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF); + const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF); + const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF); - return std::make_tuple(min_index, max_index, count); - } - else - { - // Manual min-max - alignas(16) u16 _min[8]; - alignas(16) u16 _max[8]; - - _mm_store_si128((__m128i*)_min, min); - _mm_store_si128((__m128i*)_max, max); - - u16 min_index = _min[0]; - u16 max_index = _max[0]; - - for (int i = 1; i < 8; ++i) - { - min_index = std::min(min_index, _min[i]); - max_index = std::max(max_index, _max[i]); - } - - return std::make_tuple(min_index, max_index, count); - } + return std::make_tuple(min_index, max_index, count); } static @@ -707,55 +684,29 @@ namespace _mm_storeu_si128(dst_stream++, value); } - if (s_use_sse4_1) - { - // Aggregate min-max - const __m128i mask_step1 = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); + // Aggregate min-max + const __m128i mask_step1 = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8); - const __m128i mask_step2 = _mm_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4); + const __m128i mask_step2 = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4); - // a1, a2, a3, a4 - // a1, a2, a1, a2 - // mAX - __m128i tmp = __mm_shuffle_epi8(min, mask_step1); - min = __mm_min_epu32(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step2); - min = __mm_min_epu32(min, tmp); + __m128i tmp = __mm_shuffle_epi8(min, mask_step1); + min = __mm_min_epu32(min, tmp); + tmp = __mm_shuffle_epi8(min, mask_step2); + min = __mm_min_epu32(min, tmp); - tmp = __mm_shuffle_epi8(max, mask_step1); - max = __mm_max_epu32(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step2); - max = __mm_max_epu32(max, tmp); + tmp = __mm_shuffle_epi8(max, mask_step1); + max = __mm_max_epu32(max, tmp); + tmp = __mm_shuffle_epi8(max, mask_step2); + max = __mm_max_epu32(max, tmp); - const u32 min_index = u32(_mm_cvtsi128_si32(min)); - const u32 max_index = u32(_mm_cvtsi128_si32(max)); + const u32 min_index = u32(_mm_cvtsi128_si32(min)); + const u32 max_index = u32(_mm_cvtsi128_si32(max)); - return std::make_tuple(min_index, max_index, count); - } - else - { - // Manual min-max - alignas(16) u32 _min[4]; - alignas(16) u32 _max[4]; - - _mm_store_si128((__m128i*)_min, min); - _mm_store_si128((__m128i*)_max, max); - - u32 min_index = _min[0]; - u32 max_index = _max[0]; - - for (int i = 1; i < 4; ++i) - { - min_index = std::min(min_index, _min[i]); - max_index = std::max(max_index, _max[i]); - } - - return std::make_tuple(min_index, max_index, count); - } + return std::make_tuple(min_index, max_index, count); } template @@ -766,7 +717,7 @@ namespace u32 written; u32 remaining = src.size(); - if (s_use_ssse3 && remaining >= 32) + if (s_use_sse4_1 && remaining >= 32) { if constexpr (std::is_same::value) {