BufferUtils: Optimize upload_untouched_skip_restart with AVX-512 paths

- The u16 path needs AVX-512 ICL because vpcompressw (AVX-512 VBMI2) isn't included in Skylake-X level AVX-512
- The u32 path is untested, as I couldn't find any games that hit it
Malcolm Jestadt 2025-03-26 17:41:14 -04:00
parent faef63e8a7
commit 430c3edede
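
For reference, a minimal scalar sketch of the operation these paths vectorize (hypothetical helper name; std::byteswap is C++23 and stands in for the project's endian handling): restart_index entries are dropped from the byteswapped index stream, and min/max are tracked over the surviving indices only.

    #include <algorithm> // std::min, std::max
    #include <bit>       // std::byteswap (C++23)
    #include <tuple>

    template <typename T>
    std::tuple<T, T, u32> upload_skip_restart_scalar(const T* src_be, T* dst, u32 count, T restart_index)
    {
        T min_index = static_cast<T>(~T{0}); // start at numeric max
        T max_index = 0;
        u32 written = 0;
        for (u32 i = 0; i < count; ++i)
        {
            const T index = std::byteswap(src_be[i]); // indices arrive big-endian
            if (index == restart_index)
                continue; // restart entries are skipped, never stored
            dst[written++] = index;
            min_index = std::min(min_index, index);
            max_index = std::max(max_index, index);
        }
        return std::make_tuple(min_index, max_index, written);
    }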

@@ -15,42 +15,57 @@
#define SSE4_1_FUNC
#define AVX2_FUNC
#define AVX3_FUNC
#define AVX512_ICL_FUNC
#else
#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
#define AVX2_FUNC __attribute__((__target__("avx2")))
#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl")))
#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
#endif // _MSC_VER
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = true;
[[maybe_unused]] constexpr bool s_use_avx512_icl = true;
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = true;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__AVX2__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = true;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__SSE4_1__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(__SSSE3__)
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
[[maybe_unused]] constexpr bool s_use_sse4_1 = false;
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#elif defined(ARCH_X64)
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
[[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41();
[[maybe_unused]] const bool s_use_avx2 = utils::has_avx2();
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
[[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl();
#else
[[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86
[[maybe_unused]] constexpr bool s_use_sse4_1 = true; // Non x86
[[maybe_unused]] constexpr bool s_use_avx2 = false;
[[maybe_unused]] constexpr bool s_use_avx3 = false;
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
#endif
const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
@@ -404,6 +419,153 @@ namespace
}
};
#if defined(ARCH_X64)
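// Helpers for the final horizontal reduction: _mm_minpos_epu16 returns the
// minimum u16 lane (and its position); the max is obtained via the
// complement identity max(x) == ~min(~x).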
SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
{
return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
}
SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
{
return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1))));
}
AVX512_ICL_FUNC
static
std::tuple<u16, u16, u32> upload_u16_swapped_avx512_icl_skip_restart(const void *src, void *dst, u32 count, u16 restart_index)
{
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
auto src_stream = static_cast<const __m512i*>(src);
auto dst_stream = static_cast<u16 *>(dst);
const __m512i restart = _mm512_set1_epi16(restart_index);
__m512i min = _mm512_set1_epi16(-1);
__m512i max = _mm512_set1_epi16(0);
const __m512i ones = _mm512_set1_epi16(-1);
int written = 0;
const auto iterations = count / 32;
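// Per 32-lane block: byteswap to little-endian, drop lanes equal to
// restart_index with a masked compress (vpcompressw, hence the ICL
// requirement), and track min/max over the surviving lanes only.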
for (u32 i = 0; i < iterations; i++)
{
const __m512i raw = _mm512_loadu_si512(src_stream++);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
max = _mm512_mask_max_epu16(max, mask, max, value);
min = _mm512_mask_min_epu16(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
const int processed = _mm_popcnt_u32(mask);
_mm512_storeu_si512(dst_stream, packed);
dst_stream += processed;
written += processed;
}
u32 remainder = count % 32;
if (remainder > 0)
{
const __mmask32 rem_mask = (1U << remainder) - 1;
const __m512i raw = _mm512_maskz_loadu_epi16(rem_mask, src_stream);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
const __mmask32 mask = _mm512_mask_cmpneq_epi16_mask(rem_mask, restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
max = _mm512_mask_max_epu16(max, mask, max, value);
min = _mm512_mask_min_epu16(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
const int processed = _mm_popcnt_u32(mask);
const __mmask32 store_mask = (1U << processed) - 1;
_mm512_mask_storeu_epi16(dst_stream, store_mask, packed);
written += processed;
}
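// Reduce the 512-bit min/max accumulators to scalars: fold 512 -> 256 -> 128
// bits, then finish with the SSE4.1 minpos helpers above.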
__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
__m256i min2 = _mm512_castsi512_si256(min);
min2 = _mm256_min_epu16(min2, tmp256);
__m128i tmp = _mm256_extracti128_si256(min2, 1);
__m128i min3 = _mm256_castsi256_si128(min2);
min3 = _mm_min_epu16(min3, tmp);
tmp256 = _mm512_extracti64x4_epi64(max, 1);
__m256i max2 = _mm512_castsi512_si256(max);
max2 = _mm256_max_epu16(max2, tmp256);
tmp = _mm256_extracti128_si256(max2, 1);
__m128i max3 = _mm256_castsi256_si128(max2);
max3 = _mm_max_epu16(max3, tmp);
const u16 min_index = sse41_hmin_epu16(min3);
const u16 max_index = sse41_hmax_epu16(max3);
return std::make_tuple(min_index, max_index, written);
}
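// The u32 variant only needs Skylake-X level AVX-512: vpcompressd is part of
// AVX512F, unlike vpcompressw, which requires VBMI2 (Ice Lake).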
AVX3_FUNC
static
std::tuple<u32, u32, u32> upload_u32_swapped_avx3_skip_restart(const void *src, void *dst, u32 count, u32 restart_index)
{
const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4(s_bswap_u32_mask);
auto src_stream = static_cast<const __m512i*>(src);
auto dst_stream = static_cast<u32 *>(dst);
const __m512i restart = _mm512_set1_epi32(restart_index);
__m512i min = _mm512_set1_epi32(-1);
__m512i max = _mm512_set1_epi32(0);
const __m512i ones = _mm512_set1_epi32(-1);
int written = 0;
const u32 iterations = count / 16;
for (u32 i = 0; i < iterations; i++)
{
const __m512i raw = _mm512_loadu_si512(src_stream++);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
const __mmask16 mask = _mm512_cmpneq_epi32_mask(restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
max = _mm512_mask_max_epu32(max, mask, max, value);
min = _mm512_mask_min_epu32(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
const int processed = _mm_popcnt_u32(mask);
_mm512_storeu_si512(dst_stream, packed);
dst_stream += processed;
written += processed;
}
u32 remainder = count % 16;
if (remainder > 0)
{
const __mmask16 rem_mask = (1U << remainder) - 1;
const __m512i raw = _mm512_maskz_loadu_epi32(rem_mask, src_stream);
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
const __mmask16 mask = _mm512_mask_cmpneq_epi32_mask(rem_mask, restart, value);
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
max = _mm512_mask_max_epu32(max, mask, max, value);
min = _mm512_mask_min_epu32(min, mask, min, value);
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
const int processed = _mm_popcnt_u32(mask);
const __mmask16 store_mask = (1U << processed) - 1;
_mm512_mask_storeu_epi32(dst_stream, store_mask, packed);
written += processed;
}
u32 min_index = _mm512_reduce_min_epu32(min);
u32 max_index = _mm512_reduce_max_epu32(max);
return std::make_tuple(min_index, max_index, written);
}
#endif
template <typename T>
NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
{
@@ -412,6 +574,26 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
u32 written = 0;
u32 length = ::size32(src);
#if defined(ARCH_X64)
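// Dispatch to the wide paths when the CPU supports them; otherwise fall
// through to the scalar loop below.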
if constexpr (std::is_same_v<T, u16>)
{
if (s_use_avx512_icl)
{
std::tie(min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart(src.data(), dst.data(), length, restart_index);
return std::make_tuple(min_index, max_index, written);
}
}
if constexpr (std::is_same_v<T, u32>)
{
if (s_use_avx3)
{
std::tie(min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart(src.data(), dst.data(), length, restart_index);
return std::make_tuple(min_index, max_index, written);
}
}
#endif
for (u32 i = written; i < length; ++i)
{
T index = src[i];