diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index b99b437adf..7f7a56ad3d 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -889,7 +889,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 		}
 	});
 
-	while (std::accumulate(std::begin(ctr->cpu_copy_bits), std::end(ctr->cpu_copy_bits), u64{0}, std::bit_or()))
+	while (true)
 	{
 		// Check only CPUs which haven't acknowledged their waiting state yet
 		for_all_cpu([&](cpu_thread* cpu, u64 index)
@@ -900,6 +900,11 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 			}
 		});
 
+		if (!std::accumulate(std::begin(ctr->cpu_copy_bits), std::end(ctr->cpu_copy_bits), u64{0}, std::bit_or()))
+		{
+			break;
+		}
+
 		_mm_pause();
 	}
 
@@ -927,13 +932,20 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 		while (prev);
 	}
 
+	// Execute prefetch hint(s)
+	for (auto work = head; work; work = work->next)
+	{
+		for (u32 i = 0; i < work->prf_size; i++)
+		{
+			_m_prefetchw(work->prf_list[i]);
+		}
+	}
+
 	for_all_cpu([&](cpu_thread* cpu)
 	{
 		_m_prefetchw(&cpu->state);
 	});
 
-	_m_prefetchw(&g_suspend_counter);
-
 	// Execute all stored workload
 	for (s32 prio = max_prio; prio >= min_prio; prio--)
 	{
@@ -948,6 +960,9 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 		}
 	}
 
+	// Not sure if needed, may be overkill. Some workloads may execute instructions with non-temporal hint.
+	_mm_sfence();
+
 	// Finalization
 	g_suspend_counter++;
 
diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h
index 873c970714..cfe76c9539 100644
--- a/rpcs3/Emu/CPU/CPUThread.h
+++ b/rpcs3/Emu/CPU/CPUThread.h
@@ -127,6 +127,10 @@ public:
 		// Task priority
 		s8 prio;
 
+		// Size of prefetch list workload
+		u32 prf_size;
+		void* const* prf_list;
+
 		void* func_ptr;
 		void* res_buf;
 
@@ -142,11 +146,11 @@ public:
 
 	// Suspend all threads and execute op (may be executed by other thread than caller!)
 	template <s8 Prio = 0, typename F>
-	static auto suspend_all(cpu_thread* _this, F op)
+	static auto suspend_all(cpu_thread* _this, std::initializer_list<void*> hints, F op)
 	{
 		if constexpr (std::is_void_v<std::invoke_result_t<F>>)
 		{
-			suspend_work work{Prio, &op, nullptr, [](void* func, void*)
+			suspend_work work{Prio, ::size32(hints), hints.begin(), &op, nullptr, [](void* func, void*)
 			{
 				std::invoke(*static_cast<F*>(func));
 			}};
@@ -158,7 +162,7 @@ public:
 		{
 			std::invoke_result_t<F> result;
 
-			suspend_work work{Prio, &op, &result, [](void* func, void* res_buf)
+			suspend_work work{Prio, ::size32(hints), hints.begin(), &op, &result, [](void* func, void* res_buf)
 			{
 				*static_cast<std::invoke_result_t<F>*>(res_buf) = std::invoke(*static_cast<F*>(func));
 			}};
@@ -170,11 +174,11 @@ public:
 
 	// Push the workload only if threads are being suspended by suspend_all()
 	template <s8 Prio = 0, typename F>
-	static bool if_suspended(cpu_thread* _this, F op)
+	static bool if_suspended(cpu_thread* _this, std::initializer_list<void*> hints, F op)
 	{
 		static_assert(std::is_void_v<std::invoke_result_t<F>>, "Unimplemented (must return void)");
 		{
-			suspend_work work{Prio, &op, nullptr, [](void* func, void*)
+			suspend_work work{Prio, ::size32(hints), hints.begin(), &op, nullptr, [](void* func, void*)
 			{
 				std::invoke(*static_cast<F*>(func));
 			}};
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index fed1a8fafd..9b99cb2f80 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -78,6 +78,7 @@ extern atomic_t<u32> g_progr_pdone;
 using spu_rdata_t = decltype(ppu_thread::rdata);
 
 extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src);
+extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src);
 extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs);
 
 extern u32(*const spu_getllar_tx)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime);
@@ -1234,13 +1235,17 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		{
 			if (ppu.state & cpu_flag::pause)
 			{
-				verify(HERE), cpu_thread::if_suspended<-1>(&ppu, [&]()
+				auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
+
+				verify(HERE), cpu_thread::if_suspended<-1>(&ppu, {}, [&]()
 				{
 					// Guaranteed success
 					ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
-					mov_rdata(ppu.rdata, *vm::get_super_ptr<spu_rdata_t>(addr & -128));
+					mov_rdata_nt(ppu.rdata, sdata);
 				});
 
+				_mm_mfence();
+
 				// Exit loop
 				if ((ppu.rtime & 127) == 0)
 				{
@@ -1724,18 +1729,19 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 		{
 		case UINT32_MAX:
 		{
-			const bool ok = cpu_thread::suspend_all<+1>(&ppu, [&]
-			{
-				auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
+			auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
+			auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
 
+			const bool ok = cpu_thread::suspend_all<+1>(&ppu, {all_data, all_data + 64, &res}, [&]
+			{
 				if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data))
 				{
-					data.release(new_data);
+					sdata.release(new_data);
 					res += 127;
 					return true;
 				}
 
-				mov_rdata(ppu.rdata, all_data);
+				mov_rdata_nt(ppu.rdata, all_data);
 				res -= 1;
 				return false;
 			});
@@ -1754,6 +1760,8 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 			ppu.last_fail++;
 		}
 
+		_m_prefetchw(ppu.rdata);
+		_m_prefetchw(ppu.rdata + 64);
 		ppu.last_faddr = addr;
 		ppu.last_ftime = res.load() & -128;
 		ppu.last_ftsc = __rdtsc();
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 552c9b5b14..7effe13e9c 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -220,6 +220,69 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
 	_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
 }
 
+static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
+{
+#ifdef _MSC_VER
+	_mm256_stream_si256(dst + 0, _mm256_load_si256(src + 0));
+	_mm256_stream_si256(dst + 1, _mm256_load_si256(src + 1));
+	_mm256_stream_si256(dst + 2, _mm256_load_si256(src + 2));
+	_mm256_stream_si256(dst + 3, _mm256_load_si256(src + 3));
+#else
+	__asm__(
+		"vmovdqa 0*32(%[src]), %%ymm0;" // load
+		"vmovntdq %%ymm0, 0*32(%[dst]);" // store
+		"vmovdqa 1*32(%[src]), %%ymm0;"
+		"vmovntdq %%ymm0, 1*32(%[dst]);"
+		"vmovdqa 2*32(%[src]), %%ymm0;"
+		"vmovntdq %%ymm0, 2*32(%[dst]);"
+		"vmovdqa 3*32(%[src]), %%ymm0;"
+		"vmovntdq %%ymm0, 3*32(%[dst]);"
+#ifndef __AVX__
+		"vzeroupper" // Don't need in AVX mode (should be emitted automatically)
+#endif
+		:
+		: [src] "r" (src)
+		, [dst] "r" (dst)
+#ifdef __AVX__
+		: "ymm0" // Clobber ymm0 register (acknowledge its modification)
+#else
+		: "xmm0" // ymm0 is "unknown" if not compiled in AVX mode, so clobber xmm0 only
+#endif
+	);
+#endif
+}
+
+extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src)
+{
+#ifndef __AVX__
+	if (s_tsx_avx) [[likely]]
+#endif
+	{
+		mov_rdata_nt_avx(reinterpret_cast<__m256i*>(_dst), reinterpret_cast<const __m256i*>(_src));
+		return;
+	}
+
+	{
+		const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 0));
+		const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 16));
+		const __m128i v2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 32));
+		const __m128i v3 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 48));
+		_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 0), v0);
+		_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 16), v1);
+		_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 32), v2);
+		_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 48), v3);
+	}
+
+	const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 64));
+	const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 80));
+	const __m128i v2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 96));
+	const __m128i v3 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 112));
+	_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 64), v0);
+	_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
+	_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
+	_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
+}
+
 extern u64 get_timebased_time();
 extern u64 get_system_time();
 
@@ -1845,7 +1908,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 				{
 					if (_cpu->state & cpu_flag::pause)
 					{
-						cpu_thread::if_suspended(_cpu, [&]
+						cpu_thread::if_suspended(_cpu, {dst, dst + 64, &res}, [&]
 						{
 							std::memcpy(dst, src, size0);
 							res += 128;
@@ -2370,10 +2433,10 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 		{
 		case UINT32_MAX:
 		{
-			const bool ok = cpu_thread::suspend_all<+1>(this, [&]()
-			{
-				auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
+			auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
 
+			const bool ok = cpu_thread::suspend_all<+1>(this, {data, data + 64, &res}, [&]()
+			{
 				if ((res & -128) == rtime)
 				{
 					if (cmp_rdata(rdata, data))
@@ -2385,7 +2448,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 				}
 
 				// Save previous data
-				mov_rdata(rdata, data);
+				mov_rdata_nt(rdata, data);
 				res -= 1;
 				return false;
 			});
@@ -2404,6 +2467,8 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 			last_fail++;
 		}
 
+		_m_prefetchw(rdata);
+		_m_prefetchw(rdata + 64);
 		last_faddr = addr;
 		last_ftime = res.load() & -128;
 		last_ftsc = __rdtsc();
@@ -2509,11 +2574,13 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 
 	if (result == 0)
 	{
-		// Execute with increased priority
-		cpu_thread::suspend_all<0>(cpu, [&]
+		auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
+		auto& res = vm::reservation_acquire(addr, 128);
+
+		cpu_thread::suspend_all<0>(cpu, {&res}, [&]
 		{
-			mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
-			vm::reservation_acquire(addr, 128) += 127;
+			mov_rdata_nt(sdata, *static_cast<const spu_rdata_t*>(to_write));
+			res += 127;
 		});
 	}
 	else if (result > 60 && g_cfg.core.perf_report) [[unlikely]]
@@ -2767,13 +2834,17 @@ bool spu_thread::process_mfc_cmd()
 		{
 			if (state & cpu_flag::pause)
 			{
-				verify(HERE), cpu_thread::if_suspended<-1>(this, [&]
+				auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
+
+				verify(HERE), cpu_thread::if_suspended<-1>(this, {}, [&]
 				{
 					// Guaranteed success
 					ntime = vm::reservation_acquire(addr, 128);
-					mov_rdata(rdata, *vm::get_super_ptr<spu_rdata_t>(addr));
+					mov_rdata_nt(rdata, sdata);
 				});
 
+				_mm_mfence();
+
 				// Exit loop
 				if ((ntime & 127) == 0)
 				{
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 6ea1f9323a..13bb3ee37d 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -543,16 +543,18 @@ namespace vm
 
 	void reservation_op_internal(u32 addr, std::function<bool()> func)
 	{
-		cpu_thread::suspend_all(get_current_cpu_thread(), [&]
+		auto& res = vm::reservation_acquire(addr, 128);
+
+		cpu_thread::suspend_all(get_current_cpu_thread(), {&res}, [&]
 		{
 			if (func())
 			{
 				// Success, release all locks if necessary
-				vm::reservation_acquire(addr, 128) += 127;
+				res += 127;
 			}
 			else
 			{
-				vm::reservation_acquire(addr, 128) -= 1;
+				res -= 1;
 			}
 		});
 	}
diff --git a/rpcs3/rpcs3qt/cheat_manager.cpp b/rpcs3/rpcs3qt/cheat_manager.cpp
index 78c82ed9b1..0040f8a1e3 100644
--- a/rpcs3/rpcs3qt/cheat_manager.cpp
+++ b/rpcs3/rpcs3qt/cheat_manager.cpp
@@ -319,7 +319,7 @@ std::vector<u32> cheat_engine::search(const T value, const std::vector<u32>& to_
 	if (Emu.IsStopped())
 		return {};
 
-	cpu_thread::suspend_all(nullptr, [&]
+	cpu_thread::suspend_all(nullptr, {}, [&]
 	{
 		if (!to_filter.empty())
 		{
@@ -362,7 +362,7 @@ T cheat_engine::get_value(const u32 offset, bool& success)
 		return 0;
 	}
 
-	return cpu_thread::suspend_all(nullptr, [&]() -> T
+	return cpu_thread::suspend_all(nullptr, {}, [&]() -> T
 	{
 		if (!vm::check_addr(offset, sizeof(T)))
 		{
@@ -386,7 +386,7 @@ bool cheat_engine::set_value(const u32 offset, const T value)
 		return false;
 	}
 
-	return cpu_thread::suspend_all(nullptr, [&]
+	return cpu_thread::suspend_all(nullptr, {}, [&]
 	{
 		if (!vm::check_addr(offset, sizeof(T)))
 		{
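
Usage note (editor's sketch, not part of the patch): the new second argument of cpu_thread::suspend_all() / if_suspended() is a list of addresses the queued workload is about to write, so suspend_work::push() can issue PREFETCHW on them before executing the workloads; callers with nothing useful to prefetch pass an empty list {}. Below is a minimal caller modelled on the do_putllc()/do_cell_atomic_128_store() call sites above; `cpu`, `addr` and `new_data` (a prepared spu_rdata_t) are hypothetical names assumed to be in scope, everything else comes from this patch.

	// Hint the 128-byte line and the reservation counter that the workload will write
	auto& line = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
	auto& res = vm::reservation_acquire(addr, 128);

	cpu_thread::suspend_all<0>(cpu, {line, line + 64, &res}, [&]
	{
		mov_rdata_nt(line, new_data); // non-temporal copy, ordered by the _mm_sfence() in push()
		res += 127;                   // advance/unlock the reservation
	});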