From d4cf12bc17d1eb0c59a356f17034f0e892b7a201 Mon Sep 17 00:00:00 2001
From: Whatcookie
Date: Sat, 5 Aug 2023 04:49:30 -0400
Subject: [PATCH] LV2: Improve sys_timer_usleep by using CPU usermode waiting

* Linux: set timerslack to minimum value
  - Linux delays the wakeup of threads to save power; this feature isn't needed for this application
* Utils: Add detection for waitpkg and monitorx extensions
  - These instructions are used for user mode wait instructions
* lv2: Use user mode wait instructions instead of yielding when appropriate
---
 rpcs3/Emu/Cell/lv2/lv2.cpp | 56 ++++++++++++++++++++++++++++++++++++++
 rpcs3/main.cpp             |  5 ++++
 rpcs3/util/sysinfo.cpp     | 35 +++++++++++++++++++++++-
 rpcs3/util/sysinfo.hpp     |  6 ++++
 4 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/rpcs3/Emu/Cell/lv2/lv2.cpp b/rpcs3/Emu/Cell/lv2/lv2.cpp
index 9b2473ec2b..b265c83dfe 100644
--- a/rpcs3/Emu/Cell/lv2/lv2.cpp
+++ b/rpcs3/Emu/Cell/lv2/lv2.cpp
@@ -53,6 +53,17 @@
 #include 
 #include 
 #include "util/tsc.hpp"
+#include "util/sysinfo.hpp"
+
+#if defined(ARCH_X64)
+#ifdef _MSC_VER
+#include 
+#include 
+#else
+#include 
+#endif
+#endif
+
 
 extern std::string ppu_get_syscall_name(u64 code);
 
@@ -1880,6 +1891,35 @@ void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
 	g_lv2_preempts_taken.release(0);
 }
 
+#if defined(_MSC_VER)
+#define mwaitx_func
+#define waitpkg_func
+#else
+#define mwaitx_func __attribute__((__target__("mwaitx")))
+#define waitpkg_func __attribute__((__target__("waitpkg")))
+#endif
+
+#if defined(ARCH_X64)
+// Waits for a number of TSC clock cycles in a power-optimized state
+// The target C-state is encoded as bits [7:4] + 1, so C0 requires bits [7:4] set to 0xf and C1 requires bits [7:4] set to 0
+mwaitx_func static void __mwaitx(u32 cycles, u32 cstate)
+{
+	constexpr u32 timer_enable = 0x2;
+
+	// monitorx will wake if the cache line is written to. We don't want this, so place the monitor value on its own cache line.
+	alignas(64) u64 monitor_var{};
+	_mm_monitorx(&monitor_var, 0, 0);
+	_mm_mwaitx(timer_enable, cstate, cycles);
+}
+
+// The first bit selects the C-state: 0x0 for the C0.2 state (lower power), 0x1 for the C0.1 state (higher power)
+waitpkg_func static void __tpause(u32 cycles, u32 cstate)
+{
+	const u64 tsc = utils::get_tsc() + cycles;
+	_tpause(cstate, tsc);
+}
+#endif
+
 bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
 {
 	static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
@@ -1965,6 +2005,7 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
 		if (remaining > host_min_quantum)
 		{
 #ifdef __linux__
+			// With timerslack set low, Linux sleeps precisely for all values above the minimum quantum
 			wait_for(remaining);
 #else
 			// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
@@ -1972,6 +2013,21 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
 #endif
 		}
 		// TODO: Determine best value for yield delay
+#if defined(ARCH_X64)
+		else if (utils::has_appropriate_um_wait())
+		{
+			u32 us_in_tsc_clocks = remaining * (utils::get_tsc_freq() / 1000000);
+
+			if (utils::has_waitpkg())
+			{
+				__tpause(us_in_tsc_clocks, 0x1);
+			}
+			else
+			{
+				__mwaitx(us_in_tsc_clocks, 0xf0);
+			}
+		}
+#endif
 		else
 		{
 			// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
diff --git a/rpcs3/main.cpp b/rpcs3/main.cpp
index 5354af4e69..3dda91b80a 100644
--- a/rpcs3/main.cpp
+++ b/rpcs3/main.cpp
@@ -1049,6 +1049,11 @@ int main(int argc, char** argv)
 		}
 	}
 
+// Set the timerslack value for Linux. The default value is 50,000 ns. Change this to just 1 since we value precise timers.
+#ifdef __linux__
+	prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
+#endif
+
 #ifdef _WIN32
 	// Create dummy permanent low resolution timer to workaround messing with system timer resolution
 	QTimer* dummy_timer = new QTimer(app.data());
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index af44d1df50..24a264a32f 100755
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -298,7 +298,7 @@ bool utils::has_fma4()
 bool utils::has_fast_vperm2b()
 {
 #if defined(ARCH_X64)
-	static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20) == 0x20;
+	static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x40) == 0x40;
 	return g_value;
 #else
 	return false;
@@ -325,6 +325,39 @@ bool utils::has_fsrm()
 #endif
 }
 
+bool utils::has_waitx()
+{
+#if defined(ARCH_X64)
+	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20000000) == 0x20000000;
+	return g_value;
+#else
+	return false;
+#endif
+}
+
+bool utils::has_waitpkg()
+{
+#if defined(ARCH_X64)
+	static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[2] & 0x20) == 0x20;
+	return g_value;
+#else
+	return false;
+#endif
+}
+
+// User mode waits may be unfriendly to CPUs with low thread counts
+// Filter out systems with fewer than 8 threads on Linux and fewer than 12 threads on other platforms
+bool utils::has_appropriate_um_wait()
+{
+#ifdef __linux__
+	static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 8) && get_tsc_freq();
+	return g_value;
+#else
+	static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 12) && get_tsc_freq();
+	return g_value;
+#endif
+}
+
 u32 utils::get_rep_movsb_threshold()
 {
 	static const u32 g_value = []()
diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp
index 39826af5a8..2ce46b7c6c 100755
--- a/rpcs3/util/sysinfo.hpp
+++ b/rpcs3/util/sysinfo.hpp
@@ -53,6 +53,12 @@
 
 	bool has_fsrm();
 
+	bool has_waitx();
+
+	bool has_waitpkg();
+
+	bool has_appropriate_um_wait();
+
 	std::string get_cpu_brand();
 
 	std::string get_system_info();
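
For reference, the mechanism this patch relies on can be exercised outside RPCS3. Below is a minimal standalone sketch, not the patch's code, of the same selection logic: prefer TPAUSE when CPUID reports waitpkg, fall back to MWAITX when it reports monitorx, and wait for a number of TSC cycles in a power-optimized state. It assumes GCC or Clang on x86-64; the helper names, header choices, and the 3 GHz TSC frequency are assumptions made for the example (RPCS3 measures the real frequency via utils::get_tsc_freq()).

// um_wait_sketch.cpp - illustration only, not RPCS3 code
#include <cpuid.h>
#include <cstdint>
#include <cstdio>
#include <x86intrin.h>

static bool cpu_has_waitpkg()
{
    unsigned a = 0, b = 0, c = 0, d = 0;
    if (!__get_cpuid_count(7, 0, &a, &b, &c, &d))
        return false;
    return (c & (1u << 5)) != 0; // CPUID.(EAX=7,ECX=0):ECX[5] = WAITPKG
}

static bool cpu_has_mwaitx()
{
    unsigned a = 0, b = 0, c = 0, d = 0;
    if (!__get_cpuid(0x80000001, &a, &b, &c, &d))
        return false;
    return (c & (1u << 29)) != 0; // CPUID.80000001h:ECX[29] = MONITORX/MWAITX
}

__attribute__((target("waitpkg")))
static void wait_tpause(uint64_t tsc_cycles)
{
    // 0x1 selects the C0.1 state (faster wakeup, slightly higher power)
    _tpause(0x1, __rdtsc() + tsc_cycles);
}

__attribute__((target("mwaitx")))
static void wait_mwaitx(uint32_t tsc_cycles)
{
    // Monitor a private cache line so only the timer (extensions bit 1) can wake us
    alignas(64) uint64_t monitor_var = 0;
    _mm_monitorx(&monitor_var, 0, 0);
    _mm_mwaitx(0x2 /* enable timer */, 0xf0 /* C0 hint */, tsc_cycles);
}

int main()
{
    constexpr uint64_t assumed_tsc_hz = 3'000'000'000; // assumption: 3 GHz invariant TSC
    const uint64_t cycles_per_us = assumed_tsc_hz / 1'000'000;

    const uint64_t before = __rdtsc();

    if (cpu_has_waitpkg())
        wait_tpause(50 * cycles_per_us);                 // ~50 us
    else if (cpu_has_mwaitx())
        wait_mwaitx(uint32_t(50 * cycles_per_us));
    else
        std::printf("no user-mode wait support, would fall back to yield\n");

    std::printf("waited ~%llu TSC cycles\n",
                static_cast<unsigned long long>(__rdtsc() - before));
    return 0;
}

A plain `g++ -O2 um_wait_sketch.cpp` suffices: the per-function target attributes stand in for -mwaitpkg/-mmwaitx, mirroring the mwaitx_func/waitpkg_func approach taken in the patch.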
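The timerslack change in main.cpp can be illustrated the same way. The sketch below (again an assumption-laden example rather than RPCS3 code; the 100 us request is arbitrary) times a short nanosleep before and after PR_SET_TIMERSLACK is reduced to 1 ns; with the default 50,000 ns slack the observed sleep typically overshoots the request noticeably.

// timerslack_sketch.cpp - Linux only, illustration of the prctl call added by the patch
#include <cstdio>
#include <sys/prctl.h>
#include <time.h>

static long sleep_100us_and_measure()
{
    timespec req{0, 100'000}; // request a 100 us sleep
    timespec t0{}, t1{};
    clock_gettime(CLOCK_MONOTONIC, &t0);
    nanosleep(&req, nullptr);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) * 1'000'000'000L + (t1.tv_nsec - t0.tv_nsec);
}

int main()
{
    std::printf("default slack: slept %ld ns\n", sleep_100us_and_measure());

    // Same request with timer slack reduced to the minimum (1 ns), as the patch does at startup
    prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
    std::printf("1 ns slack:    slept %ld ns\n", sleep_100us_and_measure());
    return 0;
}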