From 6adc7f9ee65177e6f99ebfcde9044088e2a28076 Mon Sep 17 00:00:00 2001
From: Eladash <elad3356p@gmail.com>
Date: Wed, 16 Aug 2023 16:16:49 +0300
Subject: [PATCH] SPU: Use usermode waiting for busy GETLLAR loop

---
 rpcs3/Emu/Cell/SPUThread.cpp | 70 +++++++++++++++++++++++++++++++++++-
 rpcs3/util/sysinfo.cpp       |  7 ++++
 rpcs3/util/sysinfo.hpp       |  2 ++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 4fdd72f143..64747776b7 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -37,6 +37,15 @@
 #include "util/sysinfo.hpp"
 #include "util/serialization.hpp"
 
+#if defined(ARCH_X64)
+#ifdef _MSC_VER
+#include <emmintrin.h>
+#include <immintrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
 using spu_rdata_t = decltype(spu_thread::rdata);
 
 template <>
@@ -320,6 +329,40 @@ extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src)
 #endif
 }
 
+#if defined(_MSC_VER)
+#define mwaitx_func
+#define waitpkg_func
+#else
+#define mwaitx_func __attribute__((__target__("mwaitx")))
+#define waitpkg_func __attribute__((__target__("waitpkg")))
+#endif
+
+#if defined(ARCH_X64)
+// Waits for a number of TSC clock cycles in power optimized state
+// Cstate is represented in bits [7:4]+1 cstate. So C0 requires bits [7:4] to be set to 0xf, C1 requires bits [7:4] to be set to 0.
+template <typename T, typename... Args>
+mwaitx_func static void __mwaitx(u32 cycles, u32 cstate, const void* cline, const Args&... args)
+{
+	constexpr u32 timer_enable = 0x2;
+
+	// monitorx will wake if the cache line is written to, use it for reservations which fits it almost perfectly
+	_mm_monitorx(const_cast<void*>(cline), 0, 0);
+
+	// Use static function to force inline
+	if (T::needs_wait(args...))
+	{
+		_mm_mwaitx(timer_enable, cstate, cycles);
+	}
+}
+
+// First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 state (higher power)
+waitpkg_func static void __tpause(u32 cycles, u32 cstate)
+{
+	const u64 tsc = utils::get_tsc() + cycles;
+	_tpause(cstate, tsc);
+}
+#endif
+
 void do_cell_atomic_128_store(u32 addr, const void* to_write);
 
 extern thread_local u64 g_tls_fault_spu;
@@ -4113,7 +4156,32 @@ bool spu_thread::process_mfc_cmd()
 
 				if (getllar_busy_waiting_switch == 1)
 				{
-					busy_wait(300);
+#if defined(ARCH_X64)
+					if (utils::has_um_wait())
+					{
+						if (utils::has_waitpkg())
+						{
+							__tpause(std::min<u32>(getllar_spin_count, 10) * 500, 0x1);
+						}
+						else
+						{
+							struct check_wait_t
+							{
+								static FORCE_INLINE bool needs_wait(u64 rtime, const atomic_t<u64>& mem_rtime) noexcept
+								{
+									return rtime == mem_rtime;
+								}
+							};
+							// Provide the first X64 cache line of the reservation to be tracked
+							__mwaitx<check_wait_t>(std::min<u32>(getllar_spin_count, 17) * 500, 0xf0, std::addressof(data),
+								rtime, vm::reservation_acquire(addr));
+						}
+					}
+					else
+#endif
+					{
+						busy_wait(300);
+					}
 				}
 
 				return true;
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 24a264a32f..108450ab30 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -358,6 +358,13 @@ bool utils::has_appropriate_um_wait()
 #endif
 }
 
+// Similar to the above function but allow execution if alternatives such as yield are not wanted
+bool utils::has_um_wait()
+{
+	static const bool g_value = (has_waitx() || has_waitpkg()) && get_tsc_freq();
+	return g_value;
+}
+
 u32 utils::get_rep_movsb_threshold()
 {
 	static const u32 g_value = []()
diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp
index 2ce46b7c6c..d7c7c2ec0e 100755
--- a/rpcs3/util/sysinfo.hpp
+++ b/rpcs3/util/sysinfo.hpp
@@ -59,6 +59,8 @@ namespace utils
 
 	bool has_appropriate_um_wait();
 
+	bool has_um_wait();
+
 	std::string get_cpu_brand();
 
 	std::string get_system_info();