diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index b45cd6304a..8c77f9aaa4 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -7,6 +7,7 @@ VFS.cpp GDB.cpp title.cpp + perf_meter.cpp ) # prevent WolfSSL from warning about not having harden options diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index 2dd62b4d40..fc3e55af0b 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -8,6 +8,7 @@ #include "Emu/GDB.h" #include "Emu/Cell/PPUThread.h" #include "Emu/Cell/SPUThread.h" +#include "Emu/perf_meter.hpp" #include #include @@ -775,6 +776,9 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen if (!next) { + // Monitor the performance only of the actual suspend processing owner + perf_meter<"SUSPEND"_u64> perf0; + // First thread to push the work to the workload list pauses all threads and processes it std::lock_guard lock(ctr->cpu_suspend_lock); @@ -911,13 +915,12 @@ void cpu_thread::stop_all() noexcept std::this_thread::sleep_for(10ms); } - sys_log.notice("All CPU threads have been stopped. [+: %u; suspends: %u]", +g_threads_created, +g_suspend_counter); + sys_log.notice("All CPU threads have been stopped. [+: %u]", +g_threads_created); std::lock_guard lock(g_fxo->get()->cpu_suspend_lock); g_threads_deleted -= g_threads_created.load(); g_threads_created = 0; - g_suspend_counter = 0; } void cpu_thread::flush_profilers() noexcept diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index f781f312eb..3d34e2b90d 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -3,6 +3,7 @@ #include "Utilities/sysinfo.h" #include "Utilities/JIT.h" #include "Crypto/sha1.h" +#include "Emu/perf_meter.hpp" #include "Emu/Memory/vm_reservation.h" #include "Emu/RSX/RSXThread.h" #include "Emu/VFS.h" @@ -1282,9 +1283,9 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8); } - if (count >= 15) [[unlikely]] + if (count >= 15 && g_cfg.core.perf_report) [[unlikely]] { - ppu_log.warning("%s took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count); + perf_log.warning("%s: took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count); } ppu.rtime &= ~vm::rsrv_shared_mask; @@ -1553,6 +1554,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) { + perf_meter<"STCX"_u32> perf0; + if (addr % sizeof(T)) { fmt::throw_exception("PPU %s: Unaligned address: 0x%08x" HERE, sizeof(T) == 4 ? "STWCX" : "STDCX", addr); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 576c85871d..0ba73acd3d 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -10,6 +10,7 @@ #include "Loader/ELF.h" #include "Emu/VFS.h" #include "Emu/IdManager.h" +#include "Emu/perf_meter.hpp" #include "Emu/RSX/RSXThread.h" #include "Emu/Cell/PPUThread.h" #include "Emu/Cell/ErrorCodes.h" @@ -2055,11 +2056,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) bool spu_thread::do_putllc(const spu_mfc_cmd& args) { + perf_meter<"PUTLLC-"_u64> perf0; + perf_meter<"PUTLLC+"_u64> perf1 = perf0; + // Store conditionally const u32 addr = args.eal & -128; if ([&]() { + perf_meter<"PUTLLC."_u64> perf2 = perf0; + if (raddr != addr) { return false; @@ -2163,6 +2169,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) { vm::reservation_notifier(addr, 128).notify_all(); raddr = 0; + perf0.reset(); return true; } else @@ -2182,12 +2189,15 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) } raddr = 0; + perf1.reset(); return false; } } void do_cell_atomic_128_store(u32 addr, const void* to_write) { + perf_meter<"STORE128"_u64> perf0; + const auto cpu = get_current_cpu_thread(); if (g_use_rtm) [[likely]] @@ -2237,6 +2247,8 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) void spu_thread::do_putlluc(const spu_mfc_cmd& args) { + perf_meter<"PUTLLUC"_u64> perf0; + const u32 addr = args.eal & -128; if (raddr && addr == raddr) @@ -2419,6 +2431,8 @@ bool spu_thread::process_mfc_cmd() std::this_thread::yield(); } + perf_meter<"GETLLAR"_u64> perf0; + alignas(64) spu_rdata_t temp; u64 ntime; @@ -2506,9 +2520,9 @@ bool spu_thread::process_mfc_cmd() continue; } - if (g_use_rtm && i >= 15) [[unlikely]] + if (g_use_rtm && i >= 15 && g_cfg.core.perf_report) [[unlikely]] { - spu_log.warning("GETLLAR took too long: %u", i); + perf_log.warning("GETLLAR: took too long: %u", i); } break; diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index b1b5ca363b..f0a59a6963 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -14,7 +14,7 @@ #include "Emu/Cell/lv2/sys_memory.h" #include "Emu/RSX/GSRender.h" #include "Emu/Cell/SPURecompiler.h" -#include +#include "Emu/perf_meter.hpp" #include #include @@ -391,6 +391,8 @@ namespace vm if (addr >= 0x10000) { + perf_meter<"SUSPEND"_u64> perf0; + for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++) { if (auto ptr = +*lock; ptr && !(ptr->state & cpu_flag::memory)) diff --git a/rpcs3/Emu/perf_meter.cpp b/rpcs3/Emu/perf_meter.cpp new file mode 100644 index 0000000000..1e42e1655e --- /dev/null +++ b/rpcs3/Emu/perf_meter.cpp @@ -0,0 +1,58 @@ +#include "stdafx.h" +#include "perf_meter.hpp" + +void perf_stat_base::push(u64 ns[66]) noexcept +{ + for (u32 i = 0; i < 66; i++) + { + m_log[i] += ns[i]; + } +} + +void perf_stat_base::print(const char* name) noexcept +{ + if (u64 num_total = m_log[0].load()) + { + perf_log.notice("Perf stats for %s: total events: %u (total time %.4fs)", name, num_total, m_log[65].load() / 1000'000'000.); + + for (u32 i = 0; i < 13; i++) + { + if (u64 count = m_log[i + 1].load()) + { + perf_log.notice("Perf stats for %s: events < %.3fµs: %u", name, std::pow(2., i) / 1000., count); + } + } + + for (u32 i = 14; i < 23; i++) + { + if (u64 count = m_log[i + 1].load()) [[unlikely]] + { + perf_log.notice("Perf stats for %s: events < %.3fms: %u", name, std::pow(2., i) / 1000'000., count); + } + } + + for (u32 i = 24; i < 33; i++) + { + if (u64 count = m_log[i + 1].load()) [[unlikely]] + { + perf_log.notice("Perf stats for %s: events < %.3fs: %u", name, std::pow(2., i) / 1000'000'000., count); + } + } + + for (u32 i = 34; i < 43; i++) + { + if (u64 count = m_log[i + 1].load()) [[unlikely]] + { + perf_log.notice("Perf stats for %s: events < %.0f SEC: %u", name, std::pow(2., i) / 1000'000'000., count); + } + } + + for (u32 i = 44; i < 63; i++) + { + if (u64 count = m_log[i + 1].load()) [[unlikely]] + { + perf_log.notice("Perf stats for %s: events < %.0f MIN: %u", name, std::pow(2., i) / 60'000'000'000., count); + } + } + } +} diff --git a/rpcs3/Emu/perf_meter.hpp b/rpcs3/Emu/perf_meter.hpp new file mode 100644 index 0000000000..588ceecf5e --- /dev/null +++ b/rpcs3/Emu/perf_meter.hpp @@ -0,0 +1,181 @@ +#pragma once + +#include "Utilities/types.h" +#include "util/logs.hpp" +#include "Utilities/sysinfo.h" +#include "system_config.h" +#include "IdManager.h" +#include +#include +#include +#include + +LOG_CHANNEL(perf_log, "PERF"); + +// TODO: constexpr with the help of bitcast +template +inline const auto perf_name = [] +{ + constexpr auto short_name = Name; + std::array result{}; + std::memcpy(result.data(), &short_name, sizeof(Name)); + return result; +}(); + +class perf_stat_base +{ + atomic_t m_log[66]{}; + +protected: + // Print accumulated values + void print(const char* name) noexcept; + + // Accumulate values from a thread + void push(u64 ns[66]) noexcept; + +public: + perf_stat_base() noexcept = default; + + perf_stat_base(const perf_stat_base&) = delete; + + perf_stat_base& operator =(const perf_stat_base&) = delete; + + ~perf_stat_base() {} +}; + +// Object that prints event length stats at the end +template +class perf_stat final : public perf_stat_base +{ + static inline thread_local struct perf_stat_local + { + // Local non-atomic values for increments + u64 m_log[66]{}; + + ~perf_stat_local() + { + // Update on thread exit + g_fxo->get()->perf_stat_base::push(m_log); + } + } g_tls_perf_stat; + +public: + ~perf_stat() + { + perf_stat_base::print(perf_name.data()); + } + + void push(u64 ns) noexcept + { + auto& data = g_tls_perf_stat.m_log; + data[0] += ns != 0; + data[64 - std::countl_zero(ns)]++; + data[65] += ns; + } +}; + +// Object that prints event length at the end +template +class perf_meter +{ + // Initialize array (possibly only 1 element) with timestamp + u64 m_timestamps[1 + sizeof...(SubEvents)] = {__rdtsc()}; + +public: + SAFE_BUFFERS perf_meter() noexcept + { + m_timestamps[0] = __rdtsc(); + std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64)); + } + + // Copy first timestamp + template + SAFE_BUFFERS perf_meter(const perf_meter& r) noexcept + { + m_timestamps[0] = r.get(); + std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64)); + } + + template + SAFE_BUFFERS perf_meter(perf_meter&& r) noexcept + { + m_timestamps[0] = r.get(); + r.reset(); + } + + // Copy first timestamp + template + SAFE_BUFFERS perf_meter& operator =(const perf_meter& r) noexcept + { + m_timestamps[0] = r.get(); + return *this; + } + + template + SAFE_BUFFERS perf_meter& operator =(perf_meter& r) noexcept + { + m_timestamps[0] = r.get(); + r.reset(); + return *this; + } + + // Push subevent data in array + template + SAFE_BUFFERS void push() noexcept + { + // TODO: should use more efficient search with type comparison, then value comparison, or pattern matching + if constexpr (std::array{(SubEvents == Event)...}[Index]) + { + // Push actual timestamp into an array + m_timestamps[Index + 1] = __rdtsc(); + } + else if constexpr (Index < sizeof...(SubEvents)) + { + // Proceed search recursively + push(); + } + } + + // Obtain initial timestamp + u64 get() const noexcept + { + return m_timestamps[0]; + } + + // Disable this counter + SAFE_BUFFERS void reset() noexcept + { + m_timestamps[0] = 0; + } + + SAFE_BUFFERS ~perf_meter() + { + // Disabled counter + if (!m_timestamps[0]) [[unlikely]] + { + return; + } + + if (!g_cfg.core.perf_report) [[likely]] + { + return; + } + + // Event end + const u64 end_time = __rdtsc(); + + // Compute difference in seconds + const f64 diff = (end_time - m_timestamps[0]) * 1. / utils::get_tsc_freq(); + + // Register perf stat in nanoseconds + g_fxo->get>()->push(static_cast(diff * 1000'000'000.)); + + // Print in microseconds + if (static_cast(diff * 1000'000.) >= g_cfg.core.perf_report_threshold) + { + perf_log.notice("%s: %.3fµs", perf_name.data(), diff * 1000'000.); + } + + // TODO: handle push(), currently ignored + } +}; diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index f1a5fd84fe..94bce51bdd 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -53,7 +53,7 @@ struct cfg_root : cfg::node cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false }; cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true }; cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively - cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM + cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false }; cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) @@ -73,6 +73,9 @@ struct cfg_root : cfg::node #else sleep_timers_accuracy_level::_usleep, true }; #endif + + cfg::uint64 perf_report_threshold{this, "Performance Report Threshold", 500, true}; // In µs, 0.5ms = default, 0 = everything + cfg::_bool perf_report{this, "Enable Performance Report", false, true}; // Show certain perf-related logs } core{ this }; struct node_vfs : cfg::node diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index a9afa27245..ba5808de54 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -107,6 +107,7 @@ + @@ -758,6 +759,7 @@ + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index bdca6947c5..bc413b4114 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -971,6 +971,9 @@ Emu + + Emu + @@ -1867,6 +1870,9 @@ Emu + + Emu +