SPU DMA: more tuning for mov_rdata_avx
Avoid unaligned stores. Prefer asm path if __AVX2__ is not set. Don't emit vzeroupper if __AVX__ is set.
parent 4f71c570bd
commit 3ec73b651e
2 changed files with 45 additions and 8 deletions
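The idea behind the first point, as a minimal sketch (illustrative names, not code from the commit): the patch only tests bit 4 of the destination address, i.e. it assumes the destination is already at least 16-byte aligned. Such a pointer is either 32-byte aligned or exactly 16 bytes past a 32-byte boundary, so one v128-sized copy is enough to make aligned 256-bit stores safe.

#include <immintrin.h>
#include <cstdint>

// Minimal sketch of the head fixup (illustrative; not code from the commit).
// Assumption: dst is at least 16-byte aligned. It is then either 32-byte
// aligned already, or exactly 16 bytes past a 32-byte boundary; one 16-byte
// copy fixes the second case, after which aligned 256-bit stores
// (_mm256_store_si256 / vmovdqa) cannot fault.
inline void align_dst_to_32(std::uint8_t*& dst, const std::uint8_t*& src, std::uint32_t& size)
{
	if (reinterpret_cast<std::uintptr_t>(dst) & 0x10)
	{
		_mm_store_si128(reinterpret_cast<__m128i*>(dst),
			_mm_loadu_si128(reinterpret_cast<const __m128i*>(src)));

		dst += 16;
		src += 16;
		size -= 16;
	}
}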
@@ -43,11 +43,12 @@ static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const
 
 static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
 {
-#if defined(_MSC_VER) || defined(__AVX__)
-	_mm256_storeu_si256(dst + 0, _mm256_loadu_si256(src + 0));
-	_mm256_storeu_si256(dst + 1, _mm256_loadu_si256(src + 1));
-	_mm256_storeu_si256(dst + 2, _mm256_loadu_si256(src + 2));
-	_mm256_storeu_si256(dst + 3, _mm256_loadu_si256(src + 3));
+#if defined(_MSC_VER) || defined(__AVX2__)
+	// In AVX-only mode, for some older CPU models, GCC/Clang may emit 128-bit loads/stores instead.
+	_mm256_store_si256(dst + 0, _mm256_loadu_si256(src + 0));
+	_mm256_store_si256(dst + 1, _mm256_loadu_si256(src + 1));
+	_mm256_store_si256(dst + 2, _mm256_loadu_si256(src + 2));
+	_mm256_store_si256(dst + 3, _mm256_loadu_si256(src + 3));
 #else
 	__asm__(
 		"vmovdqu 0*32(%[src]), %%ymm0;" // load
@@ -58,11 +59,17 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
 		"vmovdqu %%ymm0, 2*32(%[dst]);"
 		"vmovdqu 3*32(%[src]), %%ymm0;"
 		"vmovdqu %%ymm0, 3*32(%[dst]);"
-		"vzeroupper"
+#ifndef __AVX__
+		"vzeroupper" // Don't need in AVX mode (should be emitted automatically)
+#endif
 		:
 		: [src] "r" (src)
 		, [dst] "r" (dst)
-		: "xmm0"
+#ifdef __AVX__
+		: "ymm0" // Clobber ymm0 register (acknowledge its modification)
+#else
+		: "xmm0" // ymm0 is "unknown" if not compiled in AVX mode, so clobber xmm0 only
+#endif
 	);
 #endif
 }
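For orientation, the copy routine after this hunk boils down to the following self-contained sketch (the function name is illustrative; std::memcpy stands in for the inline-asm fallback so the sketch compiles without AVX support):

#include <cstring>
#if defined(_MSC_VER) || defined(__AVX2__)
#include <immintrin.h>
#endif

// Sketch only: unaligned loads, aligned 32-byte stores on the intrinsics path.
// Precondition after this commit: dst is 32-byte aligned.
static inline void copy_128_aligned(void* dst, const void* src)
{
#if defined(_MSC_VER) || defined(__AVX2__)
	auto* d = static_cast<__m256i*>(dst);
	auto* s = static_cast<const __m256i*>(src);
	_mm256_store_si256(d + 0, _mm256_loadu_si256(s + 0));
	_mm256_store_si256(d + 1, _mm256_loadu_si256(s + 1));
	_mm256_store_si256(d + 2, _mm256_loadu_si256(s + 2));
	_mm256_store_si256(d + 3, _mm256_loadu_si256(s + 3));
#else
	std::memcpy(dst, src, 128); // stand-in for the hand-written VEX asm path
#endif
}

Keeping unaligned loads but aligned stores matches the commit message: the source side may sit at an arbitrary 16-byte offset, while the destination is fixed up to 32-byte alignment before the copy.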
@@ -1512,6 +1519,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			// Split locking + transfer in two parts (before 64K border, and after it)
 			const auto lock = vm::range_lock(range_addr, nexta);
 
+			// Avoid unaligned stores in mov_rdata_avx
+			if (reinterpret_cast<u64>(dst) & 0x10)
+			{
+				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+				dst += 16;
+				src += 16;
+				size0 -= 16;
+			}
+
 			while (size0 >= 128)
 			{
 				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1536,6 +1553,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 			const auto lock = vm::range_lock(range_addr, range_end);
 
+			// Avoid unaligned stores in mov_rdata_avx
+			if (reinterpret_cast<u64>(dst) & 0x10)
+			{
+				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+				dst += 16;
+				src += 16;
+				size -= 16;
+			}
+
 			while (size >= 128)
 			{
 				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@@ -1586,6 +1613,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		}
 		default:
 		{
+			// Avoid unaligned stores in mov_rdata_avx
+			if (reinterpret_cast<u64>(dst) & 0x10)
+			{
+				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+
+				dst += 16;
+				src += 16;
+				size -= 16;
+			}
+
 			while (size >= 128)
 			{
 				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
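All three hunks above repeat the same shape. As a rough standalone picture (hypothetical names, std::memcpy standing in for the v128 copy and for mov_rdata):

#include <cstdint>
#include <cstring>

// Rough sketch of the loop shape shared by the three DMA paths patched above.
// Names are illustrative; std::memcpy stands in for the v128 copy and mov_rdata.
static void dma_copy_sketch(std::uint8_t* dst, const std::uint8_t* src, std::uint32_t size)
{
	// Avoid unaligned stores in mov_rdata_avx: one 16-byte copy reaches
	// 32-byte alignment when dst sits on an odd 16-byte boundary.
	if (reinterpret_cast<std::uintptr_t>(dst) & 0x10)
	{
		std::memcpy(dst, src, 16);

		dst += 16;
		src += 16;
		size -= 16;
	}

	// Whole 128-byte blocks go through the AVX copy (mov_rdata in the real code).
	while (size >= 128)
	{
		std::memcpy(dst, src, 128);

		dst += 128;
		src += 128;
		size -= 128;
	}

	// The real code handles the remaining tail (< 128 bytes) after the hunks
	// shown above; copy it here only to keep the sketch complete.
	if (size)
	{
		std::memcpy(dst, src, size);
	}
}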
@@ -556,7 +556,7 @@ public:
 
 	// Reservation Data
 	u64 rtime = 0;
-	std::array<v128, 8> rdata{};
+	alignas(64) std::array<v128, 8> rdata{};
 	u32 raddr = 0;
 
 	u32 srr0;
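This header change makes the destination side of those copies provably aligned. A sketch of what alignas(64) guarantees (v128 replaced by a 16-byte stand-in, since its definition is not part of this diff):

#include <array>
#include <cstdint>

// 16-byte stand-in for v128 (whose real definition is outside this diff).
struct alignas(16) v128_like { std::uint8_t bytes[16]; };

struct spu_thread_sketch
{
	std::uint64_t rtime = 0;
	alignas(64) std::array<v128_like, 8> rdata{}; // 128 bytes, starting on a cache-line boundary
	std::uint32_t raddr = 0;
};

// The member (and hence any object containing it) is 64-byte aligned, so
// 32-byte aligned stores into rdata can never fault, and the 128-byte block
// occupies exactly two cache lines.
static_assert(alignof(spu_thread_sketch) >= 64, "alignas(64) on a member propagates to the enclosing type");
static_assert(sizeof(v128_like) == 16, "stand-in matches the 16-byte v128 size");

Without the alignas, std::array<v128, 8> is only guaranteed the 16-byte alignment of v128 itself, which is exactly the case the runtime fixups above guard against by testing bit 4 of the address.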