diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index c72c02a2ef..c2e104293e 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -9,6 +9,8 @@ #include "Emu/Cell/SPUThread.h" #include +#include +#include DECLARE(cpu_thread::g_threads_created){0}; DECLARE(cpu_thread::g_threads_deleted){0}; @@ -46,6 +48,196 @@ void fmt_class_string>::format(std::string& out, u64 arg) format_bitset(out, arg, "[", "|", "]", &fmt_class_string::format); } +// CPU profiler thread +struct cpu_prof +{ + // PPU/SPU id enqueued for registration + lf_queue registered; + + struct sample_info + { + // Weak pointer to the thread + std::weak_ptr wptr; + + // Block occurences: name -> sample_count + std::unordered_map> freq; + + // Total number of samples + u64 samples = 0, idle = 0; + + sample_info(const std::shared_ptr& ptr) + : wptr(ptr) + { + } + + void reset() + { + freq.clear(); + samples = 0; + idle = 0; + } + + // Print info + void print(u32 id) const + { + // Make reversed map: sample_count -> name + std::multimap chart; + + for (auto& [name, count] : freq) + { + // Inverse bits to sort in descending order + chart.emplace(~count, name); + } + + // Print results + std::string results; + results.reserve(5100); + + // Fraction of non-idle samples + const f64 busy = 1. * (samples - idle) / samples; + + for (auto& [rcount, name] : chart) + { + // Get correct count value + const u64 count = ~rcount; + const f64 _frac = count / busy / samples; + + // Print only 7 hash characters out of 11 (which covers roughly 48 bits) + fmt::append(results, "\n\t[%s", fmt::base57(be_t{name})); + results.resize(results.size() - 4); + + // Print chunk address from lowest 16 bits + fmt::append(results, "...chunk-0x%05x]: %.4f%% (%u)", (name & 0xffff) * 4, _frac * 100., count); + + if (results.size() >= 5000) + { + // Stop printing after reaching some arbitrary limit in characters + break; + } + } + + LOG_NOTICE(GENERAL, "Thread [0x%08x]: %u samples (%.4f%% idle):%s", id, samples, 100. * idle / samples, results); + } + }; + + void operator()() + { + std::unordered_map> threads; + + while (thread_ctrl::state() != thread_state::aborting) + { + bool flush = false; + + // Handle registration channel + for (u32 id : registered.pop_all()) + { + if (id == 0) + { + // Handle id zero as a command to flush results + flush = true; + continue; + } + + std::shared_ptr ptr; + + if (id >> 24 == 1) + { + ptr = idm::get>(id); + } + else if (id >> 24 == 2) + { + ptr = idm::get>(id); + } + else + { + LOG_FATAL(GENERAL, "Invalid Thread ID: 0x%08x", id); + continue; + } + + if (ptr) + { + auto [found, add] = threads.try_emplace(id, ptr); + + if (!add) + { + // Overwritten: print previous data + found->second.print(id); + found->second.reset(); + found->second.wptr = ptr; + } + } + } + + if (threads.empty()) + { + // Wait for messages if no work (don't waste CPU) + registered.wait(); + continue; + } + + // Sample active threads + for (auto& [id, info] : threads) + { + if (auto ptr = info.wptr.lock()) + { + // Get short function hash + const u64 name = atomic_storage::load(ptr->block_hash); + + // Append occurrence + info.samples++; + + if (!(ptr->state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause))) + { + info.freq[name]++; + + // Append verification time to fixed common name 0000000...chunk-0x3fffc + if ((name & 0xffff) == 0) + info.freq[0xffff]++; + } + else + { + info.idle++; + } + } + } + + // Cleanup and print results for deleted threads + for (auto it = threads.begin(), end = threads.end(); it != end;) + { + if (it->second.wptr.expired()) + it->second.print(it->first), it = threads.erase(it); + else + it++; + } + + if (flush) + { + LOG_SUCCESS(GENERAL, "Flushing profiling results..."); + + // Print all results and cleanup + for (auto& [id, info] : threads) + { + info.print(id); + info.reset(); + } + } + + // Wait, roughly for 20µs + thread_ctrl::wait_for(20, false); + } + + // Print all remaining results + for (auto& [id, info] : threads) + { + info.print(id); + } + } + + static constexpr auto thread_name = "CPU Profiler"sv; +}; + +using cpu_profiler = named_thread; + thread_local cpu_thread* g_tls_current_cpu_thread = nullptr; // For synchronizing suspend_all operation @@ -91,6 +283,16 @@ void cpu_thread::operator()() thread_ctrl::set_native_priority(-1); } + if (id_type() == 1 && false) + { + g_fxo->get()->registered.push(id); + } + + if (id_type() == 2 && g_cfg.core.spu_prof) + { + g_fxo->get()->registered.push(id); + } + // Register thread in g_cpu_array if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8)) { @@ -418,5 +620,22 @@ void cpu_thread::stop_all() noexcept std::this_thread::sleep_for(10ms); } + // Workaround for remaining threads (TODO) + std::this_thread::sleep_for(1300ms); + LOG_NOTICE(GENERAL, "All CPU threads have been stopped."); } + +void cpu_thread::flush_profilers() noexcept +{ + if (!g_fxo->get()) + { + LOG_FATAL(GENERAL, "cpu_thread::flush_profilers() has been called incorrectly." HERE); + return; + } + + if (g_cfg.core.spu_prof || false) + { + g_fxo->get()->registered.push(0); + } +} diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h index 8483bb8031..2a3bf0b397 100644 --- a/rpcs3/Emu/CPU/CPUThread.h +++ b/rpcs3/Emu/CPU/CPUThread.h @@ -27,7 +27,10 @@ enum class cpu_flag : u32 class cpu_thread { // PPU cache backward compatibility hack - char dummy[sizeof(std::shared_ptr)]; + char dummy[sizeof(std::shared_ptr) - 8]; + +public: + u64 block_hash = 0; protected: cpu_thread(u32 id); @@ -119,6 +122,9 @@ public: // Stop all threads with cpu_flag::dbg_global_stop static void stop_all() noexcept; + + // Send signal to the profiler(s) to flush results + static void flush_profilers() noexcept; }; inline cpu_thread* get_current_cpu_thread() noexcept diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index cae580801d..168b484b70 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -10,6 +10,7 @@ #include "Utilities/sysinfo.h" #include "Utilities/asm.h" #include "PPUAnalyser.h" +#include "Crypto/sha1.h" #include #include @@ -63,6 +64,19 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectoradd(func); } + { + sha1_context ctx; + u8 output[20]; + + sha1_starts(&ctx); + sha1_update(&ctx, reinterpret_cast(func.data() + 1), func.size() * 4 - 4); + sha1_finish(&ctx, output); + + be_t hash_start; + std::memcpy(&hash_start, output, sizeof(hash_start)); + m_hash_start = hash_start; + } + using namespace asmjit; StringLogger logger; @@ -160,6 +174,12 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectorcmp(SPU_OFF_32(state), 0); c->jnz(label_stop); + if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) + { + c->mov(x86::rax, m_hash_start & -0xffff); + c->mov(SPU_OFF_64(block_hash), x86::rax); + } + if (utils::has_avx()) { // How to check dirty AVX state @@ -722,6 +742,13 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vectoradd(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4)); + // Set block hash for profiling (if enabled) + if (g_cfg.core.spu_prof) + { + c->mov(x86::rax, m_hash_start | 0xffff); + c->mov(SPU_OFF_64(block_hash), x86::rax); + } + if (m_pos != start) { // Jump to the entry point if necessary @@ -1159,6 +1186,14 @@ void spu_recompiler::branch_set_link(u32 target) c->and_(qw1->r32(), 0x3fff0); c->pcmpeqd(x86::xmm0, x86::xmm0); c->movdqa(x86::dqword_ptr(*cpu, *qw1, 0, ::offset32(&spu_thread::stack_mirror)), x86::xmm0); + + // Set block hash for profiling (if enabled) + if (g_cfg.core.spu_prof) + { + c->mov(x86::rax, m_hash_start | 0xffff); + c->mov(SPU_OFF_64(block_hash), x86::rax); + } + c->jmp(target); }); } diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index f633dc38be..1b8c300177 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -137,6 +137,16 @@ DECLARE(spu_runtime::tr_all) = [] std::memcpy(raw, &r32, 4); raw += 4; + // Update block_hash (set zero): mov [r13 + spu_thread::m_block_hash], 0 + *raw++ = 0x49; + *raw++ = 0xc7; + *raw++ = 0x45; + *raw++ = ::narrow(::offset32(&spu_thread::block_hash)); + *raw++ = 0x00; + *raw++ = 0x00; + *raw++ = 0x00; + *raw++ = 0x00; + // jmp [rdx + rax * 8] *raw++ = 0xff; *raw++ = 0x24; @@ -4257,6 +4267,10 @@ public: m_hash.clear(); fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output)); + + be_t hash_start; + std::memcpy(&hash_start, output, sizeof(hash_start)); + m_hash_start = hash_start; } if (g_fxo->get()) @@ -4318,6 +4332,10 @@ public: u32 check_iterations = 0; m_ir->SetInsertPoint(label_test); + // Set block hash for profiling (if enabled) + if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(&spu_thread::block_hash), true); + if (!g_cfg.core.spu_verification) { // Disable check (unsafe) @@ -4507,6 +4525,11 @@ public: // Initialize function info m_entry = m_function_queue[fi]; set_function(m_functions[m_entry].chunk); + + // Set block hash for profiling (if enabled) + if (g_cfg.core.spu_prof) + m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(&spu_thread::block_hash), true); + m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 8b73fa9903..de1c06ee04 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -241,6 +241,7 @@ protected: u32 m_pos; u32 m_size; + u64 m_hash_start; // Bit indicating start of the block std::bitset<0x10000> m_block_info; diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 5994ce72b9..f3a73e4e32 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -1677,6 +1677,9 @@ bool Emulator::Pause() return m_state.compare_and_swap_test(system_state::ready, system_state::paused); } + // Signal profilers to print results (if enabled) + cpu_thread::flush_profilers(); + GetCallbacks().on_pause(); // Update pause start time diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 8252d5f0ae..88498b75f8 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -408,6 +408,7 @@ struct cfg_root : cfg::node cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false}; cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled cfg::_bool spu_cache{this, "SPU Cache", true}; + cfg::_bool spu_prof{this, "SPU Profiler", false}; cfg::_enum enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false}; cfg::_bool spu_approx_xfloat{this, "Approximate xfloat", true};