mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-19 19:15:26 +00:00
BufferUtils: Optimize upload_untoucheed_skip_restart with AVX-512 paths
- u16 path needs AVX-512-ICL because vpcompressw isn't included in skylake-x level AVX-512 - the u32 path is untested as I couldn't find any games that hit it
This commit is contained in:
parent
faef63e8a7
commit
430c3edede
1 changed files with 183 additions and 1 deletions
|
@ -15,42 +15,57 @@
|
|||
#define SSE4_1_FUNC
|
||||
#define AVX2_FUNC
|
||||
#define AVX3_FUNC
|
||||
#define AVX512_ICL_FUNC
|
||||
#else
|
||||
#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
|
||||
#define AVX2_FUNC __attribute__((__target__("avx2")))
|
||||
#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl")))
|
||||
#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = true;
|
||||
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
|
||||
#elif defined(__AVX2__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
|
||||
#elif defined(__SSE4_1__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
|
||||
#elif defined(__SSSE3__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
|
||||
#elif defined(ARCH_X64)
|
||||
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
|
||||
[[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41();
|
||||
[[maybe_unused]] const bool s_use_avx2 = utils::has_avx2();
|
||||
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
|
||||
[[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl();
|
||||
#else
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86
|
||||
[[maybe_unused]] constexpr bool s_use_sse4_1 = true; // Non x86
|
||||
[[maybe_unused]] constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
|
||||
#endif
|
||||
|
||||
const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
|
@ -404,6 +419,153 @@ namespace
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#if defined(ARCH_X64)
|
||||
|
||||
SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
|
||||
{
|
||||
return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
|
||||
}
|
||||
|
||||
SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
|
||||
{
|
||||
return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1))));
|
||||
}
|
||||
|
||||
AVX512_ICL_FUNC
|
||||
static
|
||||
std::tuple<u16, u16, u32> upload_u16_swapped_avx512_icl_skip_restart(const void *src, void *dst, u32 count, u16 restart_index)
|
||||
{
|
||||
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
|
||||
|
||||
auto src_stream = static_cast<const __m512*>(src);
|
||||
auto dst_stream = static_cast<u16 *>(dst);
|
||||
|
||||
const __m512i restart = _mm512_set1_epi16(restart_index);
|
||||
__m512i min = _mm512_set1_epi16(-1);
|
||||
__m512i max = _mm512_set1_epi16(0);
|
||||
const __m512i ones = _mm512_set1_epi16(-1);
|
||||
|
||||
int written = 0;
|
||||
|
||||
const auto iterations = count / 32;
|
||||
for (u32 i = 0; i < iterations; i++)
|
||||
{
|
||||
const __m512i raw = _mm512_loadu_si512(src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
|
||||
|
||||
max = _mm512_mask_max_epu16(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu16(min, mask, min, value);
|
||||
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
|
||||
|
||||
const int processed = _mm_popcnt_u32(mask);
|
||||
_mm512_storeu_si512(dst_stream, packed);
|
||||
dst_stream += processed;
|
||||
written += processed;
|
||||
}
|
||||
|
||||
u32 remainder = count % 32;
|
||||
if (remainder > 0)
|
||||
{
|
||||
const __mmask32 rem_mask = (1U << remainder) - 1;
|
||||
const __m512i raw = _mm512_maskz_loadu_epi16(rem_mask, src_stream);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
const __mmask32 mask = _mm512_mask_cmpneq_epi16_mask(rem_mask, restart, value);
|
||||
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
|
||||
max = _mm512_mask_max_epu16(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu16(min, mask, min, value);
|
||||
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
|
||||
|
||||
const int processed = _mm_popcnt_u32(mask);
|
||||
const __mmask32 store_mask = (1U << processed) - 1;
|
||||
_mm512_mask_storeu_epi16(dst_stream, store_mask, packed);
|
||||
written += processed;
|
||||
}
|
||||
|
||||
__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
|
||||
__m256i min2 = _mm512_castsi512_si256(min);
|
||||
min2 = _mm256_min_epu16(min2, tmp256);
|
||||
__m128i tmp = _mm256_extracti128_si256(min2, 1);
|
||||
__m128i min3 = _mm256_castsi256_si128(min2);
|
||||
min3 = _mm_min_epu16(min3, tmp);
|
||||
|
||||
tmp256 = _mm512_extracti64x4_epi64(max, 1);
|
||||
__m256i max2 = _mm512_castsi512_si256(max);
|
||||
max2 = _mm256_max_epu16(max2, tmp256);
|
||||
tmp = _mm256_extracti128_si256(max2, 1);
|
||||
__m128i max3 = _mm256_castsi256_si128(max2);
|
||||
max3 = _mm_max_epu16(max3, tmp);
|
||||
|
||||
const u16 min_index = sse41_hmin_epu16(min3);
|
||||
const u16 max_index = sse41_hmax_epu16(max3);
|
||||
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
|
||||
AVX3_FUNC
|
||||
static
|
||||
std::tuple<u32, u32, u32> upload_u32_swapped_avx3_skip_restart(const void *src, void *dst, u32 count, u32 restart_index)
|
||||
{
|
||||
const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4(s_bswap_u32_mask);
|
||||
|
||||
auto src_stream = static_cast<const __m512i*>(src);
|
||||
auto dst_stream = static_cast<u32 *>(dst);
|
||||
|
||||
const __m512i restart = _mm512_set1_epi32(restart_index);
|
||||
__m512i min = _mm512_set1_epi32(-1);
|
||||
__m512i max = _mm512_set1_epi32(0);
|
||||
const __m512i ones = _mm512_set1_epi32(-1);
|
||||
|
||||
int written = 0;
|
||||
|
||||
const u32 iterations = count / 16;
|
||||
for (u32 i = 0; i < iterations; i++)
|
||||
{
|
||||
const __m512i raw = _mm512_loadu_si512(src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
|
||||
const __mmask16 mask = _mm512_cmpneq_epi32_mask(restart, value);
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
|
||||
|
||||
max = _mm512_mask_max_epu32(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu32(min, mask, min, value);
|
||||
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
|
||||
|
||||
const int processed = _mm_popcnt_u32(mask);
|
||||
_mm512_storeu_si512(dst_stream, packed);
|
||||
dst_stream += processed;
|
||||
written += processed;
|
||||
}
|
||||
|
||||
u32 remainder = count % 16;
|
||||
if (remainder > 0)
|
||||
{
|
||||
const __mmask16 rem_mask = (1U << remainder) - 1;
|
||||
const __m512i raw = _mm512_maskz_loadu_epi32(rem_mask, src_stream);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
|
||||
|
||||
const __mmask16 mask = _mm512_mask_cmpneq_epi32_mask(rem_mask, restart, value);
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
|
||||
max = _mm512_mask_max_epu32(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu32(min, mask, min, value);
|
||||
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
|
||||
|
||||
const int processed = _mm_popcnt_u32(mask);
|
||||
const __mmask16 store_mask = (1U << processed) - 1;
|
||||
_mm512_mask_storeu_epi32(dst_stream, store_mask, packed);
|
||||
written += processed;
|
||||
}
|
||||
|
||||
u32 min_index = _mm512_reduce_min_epu32(min);
|
||||
u32 max_index = _mm512_reduce_max_epu32(max);
|
||||
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
|
||||
{
|
||||
|
@ -412,6 +574,26 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
|
|||
u32 written = 0;
|
||||
u32 length = ::size32(src);
|
||||
|
||||
#if defined(ARCH_X64)
|
||||
if constexpr (std::is_same_v<T, u16>)
|
||||
{
|
||||
if (s_use_avx512_icl)
|
||||
{
|
||||
std::tie(min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart(src.data(), dst.data(), length, restart_index);
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, u32>)
|
||||
{
|
||||
if (s_use_avx3)
|
||||
{
|
||||
std::tie(min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart(src.data(), dst.data(), length, restart_index);
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (u32 i = written; i < length; ++i)
|
||||
{
|
||||
T index = src[i];
|
||||
|
|
Loading…
Add table
Reference in a new issue