From 790fd9ce14188484d68acc295c482e3aada14978 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Tue, 28 Apr 2020 17:23:43 +0300
Subject: [PATCH] SPU DMA: implement cmp_rdata_avx

Use technique similar to mov_rdata_avx with inline assembly.
---
 rpcs3/Emu/Cell/SPUThread.cpp | 53 ++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index e11e602531..060e128a93 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -31,8 +31,55 @@ static const bool s_tsx_avx = utils::has_avx();
 // For special case
 static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
 
+static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
+{
+#if defined(_MSC_VER) || defined(__AVX__)
+	const __m256 x0 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 0)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 0)));
+	const __m256 x1 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 1)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 1)));
+	const __m256 x2 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 2)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 2)));
+	const __m256 x3 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 3)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 3)));
+	const __m256 c0 = _mm256_or_ps(x0, x1);
+	const __m256 c1 = _mm256_or_ps(x2, x3);
+	const __m256 c2 = _mm256_or_ps(c0, c1);
+	return _mm256_testz_si256(_mm256_castps_si256(c2), _mm256_castps_si256(c2)) != 0;
+#else
+	bool result = 0;
+	__asm__(
+		"vmovaps 0*32(%[lhs]), %%ymm0;" // load
+		"vmovaps 1*32(%[lhs]), %%ymm1;"
+		"vmovaps 2*32(%[lhs]), %%ymm2;"
+		"vmovaps 3*32(%[lhs]), %%ymm3;"
+		"vxorps 0*32(%[rhs]), %%ymm0, %%ymm0;" // compare
+		"vxorps 1*32(%[rhs]), %%ymm1, %%ymm1;"
+		"vxorps 2*32(%[rhs]), %%ymm2, %%ymm2;"
+		"vxorps 3*32(%[rhs]), %%ymm3, %%ymm3;"
+		"vorps %%ymm0, %%ymm1, %%ymm0;" // merge
+		"vorps %%ymm2, %%ymm3, %%ymm2;"
+		"vorps %%ymm0, %%ymm2, %%ymm0;"
+		"vptest %%ymm0, %%ymm0;" // test
+		"vzeroupper"
+		: "=@ccz" (result)
+		: [lhs] "r" (lhs)
+		, [rhs] "r" (rhs)
+		: "cc" // Clobber flags
+		, "xmm0" // Clobber registers ymm0-ymm3 (see mov_rdata_avx)
+		, "xmm1"
+		, "xmm2"
+		, "xmm3"
+	);
+	return result;
+#endif
+}
+
 static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs)
 {
+#ifndef __AVX__
+	if (s_tsx_avx) [[likely]]
+#endif
+	{
+		return cmp_rdata_avx(reinterpret_cast<const __m256i*>(&lhs), reinterpret_cast<const __m256i*>(&rhs));
+	}
+
 	const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
 	const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
 	const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
@@ -1770,7 +1817,7 @@ bool spu_thread::process_mfc_cmd()
 			continue;
 		}
 
-		if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
+		if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
 		{
 			i += 2;
 			continue;
@@ -2799,7 +2846,7 @@ bool spu_thread::stop_and_signal(u32 code)
 
 		// Check group status, wait if necessary
 		for (auto _state = +group->run_state;
-			_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
+			_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
 			_state = group->run_state)
 		{
 			if (is_stopped())
@@ -3009,7 +3056,7 @@ bool spu_thread::stop_and_signal(u32 code)
 		while (true)
 		{
 			for (auto _state = +group->run_state;
-				_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
+				_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
 				_state = group->run_state)
			{
				if (is_stopped())
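A minimal standalone sketch of the same idea, appended here for illustration only and not part of the patch above: it mirrors the intrinsics branch of cmp_rdata_avx on two 128-byte buffers. The helper name cmp_128b_avx and the small test driver are invented for this example; it assumes a compiler with AVX enabled (e.g. g++ -mavx -O2). As in the patch, the float-domain xor/or forms are used so that only AVX (not AVX2) is required, and vptest/_mm256_testz_si256 reports equality when the merged difference is all zeros.

// Illustrative sketch only (not rpcs3 code): same XOR/OR/VPTEST compare as the
// intrinsics path of cmp_rdata_avx, applied to two 128-byte blocks.
#include <immintrin.h>
#include <cstdio>

static bool cmp_128b_avx(const __m256i* lhs, const __m256i* rhs)
{
	// XOR each 32-byte lane pair; any differing bit survives as a set bit.
	const __m256 x0 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 0)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 0)));
	const __m256 x1 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 1)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 1)));
	const __m256 x2 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 2)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 2)));
	const __m256 x3 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 3)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 3)));
	// Merge the four difference masks into one.
	const __m256 c = _mm256_or_ps(_mm256_or_ps(x0, x1), _mm256_or_ps(x2, x3));
	// testz returns 1 only if every merged difference bit is zero (blocks equal).
	return _mm256_testz_si256(_mm256_castps_si256(c), _mm256_castps_si256(c)) != 0;
}

int main()
{
	alignas(32) unsigned char a[128] = {};
	alignas(32) unsigned char b[128] = {};

	std::printf("%d\n", cmp_128b_avx(reinterpret_cast<const __m256i*>(a), reinterpret_cast<const __m256i*>(b))); // 1: identical
	b[77] ^= 0x01; // flip a single bit
	std::printf("%d\n", cmp_128b_avx(reinterpret_cast<const __m256i*>(a), reinterpret_cast<const __m256i*>(b))); // 0: differ
}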