diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 0250daead2..94359b9841 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -15,42 +15,57 @@ #define SSE4_1_FUNC #define AVX2_FUNC #define AVX3_FUNC +#define AVX512_ICL_FUNC #else #define SSE4_1_FUNC __attribute__((__target__("sse4.1"))) #define AVX2_FUNC __attribute__((__target__("avx2"))) #define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl"))) +#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq"))) #endif // _MSC_VER -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__) [[maybe_unused]] constexpr bool s_use_ssse3 = true; [[maybe_unused]] constexpr bool s_use_sse4_1 = true; [[maybe_unused]] constexpr bool s_use_avx2 = true; [[maybe_unused]] constexpr bool s_use_avx3 = true; +[[maybe_unused]] constexpr bool s_use_avx512_icl = true; +#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) +[[maybe_unused]] constexpr bool s_use_ssse3 = true; +[[maybe_unused]] constexpr bool s_use_sse4_1 = true; +[[maybe_unused]] constexpr bool s_use_avx2 = true; +[[maybe_unused]] constexpr bool s_use_avx3 = true; +[[maybe_unused]] constexpr bool s_use_avx512_icl = false; #elif defined(__AVX2__) [[maybe_unused]] constexpr bool s_use_ssse3 = true; [[maybe_unused]] constexpr bool s_use_sse4_1 = true; [[maybe_unused]] constexpr bool s_use_avx2 = true; [[maybe_unused]] constexpr bool s_use_avx3 = false; +[[maybe_unused]] constexpr bool s_use_avx512_icl = false; #elif defined(__SSE4_1__) [[maybe_unused]] constexpr bool s_use_ssse3 = true; [[maybe_unused]] constexpr bool s_use_sse4_1 = true; [[maybe_unused]] constexpr bool s_use_avx2 = false; [[maybe_unused]] constexpr bool s_use_avx3 = false; +[[maybe_unused]] constexpr bool s_use_avx512_icl = false; #elif defined(__SSSE3__) [[maybe_unused]] constexpr bool s_use_ssse3 = true; [[maybe_unused]] constexpr bool s_use_sse4_1 = false; [[maybe_unused]] constexpr bool s_use_avx2 = false; [[maybe_unused]] constexpr bool s_use_avx3 = false; +[[maybe_unused]] constexpr bool s_use_avx512_icl = false; #elif defined(ARCH_X64) [[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3(); [[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41(); [[maybe_unused]] const bool s_use_avx2 = utils::has_avx2(); [[maybe_unused]] const bool s_use_avx3 = utils::has_avx512(); +[[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl(); #else [[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86 [[maybe_unused]] constexpr bool s_use_sse4_1 = true; // Non x86 [[maybe_unused]] constexpr bool s_use_avx2 = false; [[maybe_unused]] constexpr bool s_use_avx3 = false; +[[maybe_unused]] constexpr bool s_use_avx512_icl = false; #endif const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); @@ -404,6 +419,153 @@ namespace } }; + + +#if defined(ARCH_X64) + +SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x) +{ + return _mm_cvtsi128_si32(_mm_minpos_epu16(x)); +} + +SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x) +{ + return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1)))); +} + +AVX512_ICL_FUNC +static +std::tuple upload_u16_swapped_avx512_icl_skip_restart(const void *src, void *dst, u32 count, u16 restart_index) +{ + const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask); + + auto src_stream = static_cast(src); + auto dst_stream = static_cast(dst); + + const __m512i restart = _mm512_set1_epi16(restart_index); + __m512i min = _mm512_set1_epi16(-1); + __m512i max = _mm512_set1_epi16(0); + const __m512i ones = _mm512_set1_epi16(-1); + + int written = 0; + + const auto iterations = count / 32; + for (u32 i = 0; i < iterations; i++) + { + const __m512i raw = _mm512_loadu_si512(src_stream++); + const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512); + const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value); + const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value); + + max = _mm512_mask_max_epu16(max, mask, max, value); + min = _mm512_mask_min_epu16(min, mask, min, value); + const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart); + + const int processed = _mm_popcnt_u32(mask); + _mm512_storeu_si512(dst_stream, packed); + dst_stream += processed; + written += processed; + } + + u32 remainder = count % 32; + if (remainder > 0) + { + const __mmask32 rem_mask = (1U << remainder) - 1; + const __m512i raw = _mm512_maskz_loadu_epi16(rem_mask, src_stream); + const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512); + const __mmask32 mask = _mm512_mask_cmpneq_epi16_mask(rem_mask, restart, value); + + const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value); + max = _mm512_mask_max_epu16(max, mask, max, value); + min = _mm512_mask_min_epu16(min, mask, min, value); + const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart); + + const int processed = _mm_popcnt_u32(mask); + const __mmask32 store_mask = (1U << processed) - 1; + _mm512_mask_storeu_epi16(dst_stream, store_mask, packed); + written += processed; + } + + __m256i tmp256 = _mm512_extracti64x4_epi64(min, 1); + __m256i min2 = _mm512_castsi512_si256(min); + min2 = _mm256_min_epu16(min2, tmp256); + __m128i tmp = _mm256_extracti128_si256(min2, 1); + __m128i min3 = _mm256_castsi256_si128(min2); + min3 = _mm_min_epu16(min3, tmp); + + tmp256 = _mm512_extracti64x4_epi64(max, 1); + __m256i max2 = _mm512_castsi512_si256(max); + max2 = _mm256_max_epu16(max2, tmp256); + tmp = _mm256_extracti128_si256(max2, 1); + __m128i max3 = _mm256_castsi256_si128(max2); + max3 = _mm_max_epu16(max3, tmp); + + const u16 min_index = sse41_hmin_epu16(min3); + const u16 max_index = sse41_hmax_epu16(max3); + + return std::make_tuple(min_index, max_index, written); +} + +AVX3_FUNC +static +std::tuple upload_u32_swapped_avx3_skip_restart(const void *src, void *dst, u32 count, u32 restart_index) +{ + const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4(s_bswap_u32_mask); + + auto src_stream = static_cast(src); + auto dst_stream = static_cast(dst); + + const __m512i restart = _mm512_set1_epi32(restart_index); + __m512i min = _mm512_set1_epi32(-1); + __m512i max = _mm512_set1_epi32(0); + const __m512i ones = _mm512_set1_epi32(-1); + + int written = 0; + + const u32 iterations = count / 16; + for (u32 i = 0; i < iterations; i++) + { + const __m512i raw = _mm512_loadu_si512(src_stream++); + const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512); + const __mmask16 mask = _mm512_cmpneq_epi32_mask(restart, value); + const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value); + + max = _mm512_mask_max_epu32(max, mask, max, value); + min = _mm512_mask_min_epu32(min, mask, min, value); + const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart); + + const int processed = _mm_popcnt_u32(mask); + _mm512_storeu_si512(dst_stream, packed); + dst_stream += processed; + written += processed; + } + + u32 remainder = count % 16; + if (remainder > 0) + { + const __mmask16 rem_mask = (1U << remainder) - 1; + const __m512i raw = _mm512_maskz_loadu_epi32(rem_mask, src_stream); + const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512); + + const __mmask16 mask = _mm512_mask_cmpneq_epi32_mask(rem_mask, restart, value); + const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value); + max = _mm512_mask_max_epu32(max, mask, max, value); + min = _mm512_mask_min_epu32(min, mask, min, value); + const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart); + + const int processed = _mm_popcnt_u32(mask); + const __mmask16 store_mask = (1U << processed) - 1; + _mm512_mask_storeu_epi32(dst_stream, store_mask, packed); + written += processed; + } + + u32 min_index = _mm512_reduce_min_epu32(min); + u32 max_index = _mm512_reduce_max_epu32(max); + + return std::make_tuple(min_index, max_index, written); +} +#endif + template NEVER_INLINE std::tuple upload_untouched_skip_restart(std::span> src, std::span dst, T restart_index) { @@ -412,6 +574,26 @@ NEVER_INLINE std::tuple upload_untouched_skip_restart(std::span) + { + if (s_use_avx512_icl) + { + std::tie(min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart(src.data(), dst.data(), length, restart_index); + return std::make_tuple(min_index, max_index, written); + } + } + + if constexpr (std::is_same_v) + { + if (s_use_avx3) + { + std::tie(min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart(src.data(), dst.data(), length, restart_index); + return std::make_tuple(min_index, max_index, written); + } + } +#endif + for (u32 i = written; i < length; ++i) { T index = src[i];