diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 07527f7ca9..9e83fa0e5a 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -7,6 +7,7 @@
 #include "Utilities/StrUtil.h"
 #include "Utilities/JIT.h"
 #include "Utilities/sysinfo.h"
+#include "util/init_mutex.hpp"
 
 #include "SPUThread.h"
 #include "SPUAnalyser.h"
@@ -8266,7 +8267,7 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u
 struct spu_llvm
 {
 	// Workload
-	lf_queue<spu_item*> registered;
+	lf_queue<std::pair<u64, spu_item*>> registered;
 
 	void operator()()
 	{
@@ -8277,22 +8278,96 @@ struct spu_llvm
 		// Fake LS
 		std::vector<be_t<u32>> ls(0x10000);
 
-		for (auto* parg : registered)
+		// To compile (hash -> item)
+		std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
+
+		// Mini-profiler (hash -> number of occurrences)
+		std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
+
+		// For synchronization with profiler thread
+		stx::init_mutex prof_mutex;
+
+		named_thread profiler("SPU LLVM Profiler"sv, [&]()
 		{
-			if (thread_ctrl::state() == thread_state::aborting)
+			while (thread_ctrl::state() != thread_state::aborting)
 			{
-				break;
+				{
+					// Lock if enabled
+					const auto lock = prof_mutex.access();
+
+					if (!lock)
+					{
+						// Wait when the profiler is disabled
+						prof_mutex.wait_for_initialized();
+						continue;
+					}
+
+					// Collect profiling samples
+					idm::select<named_thread<spu_thread>>([&](u32 id, spu_thread& spu)
+					{
+						const u64 name = atomic_storage<u64>::load(spu.block_hash);
+
+						if (!(spu.state.load() & (cpu_flag::wait + cpu_flag::stop + cpu_flag::dbg_global_pause)))
+						{
+							const auto found = std::as_const(samples).find(spu.block_hash);
+
+							if (found != std::as_const(samples).end())
+							{
+								const_cast<atomic_t<u64>&>(found->second)++;
+							}
+						}
+					});
+				}
+
+				// Sleep for a short period if enabled
+				thread_ctrl::wait_for(20, false);
+			}
+		});
+
+		while (thread_ctrl::state() != thread_state::aborting)
+		{
+			for (const auto& pair : registered.pop_all())
+			{
+				enqueued.emplace(pair);
+
+				// Interrupt and kick profiler thread
+				const auto lock = prof_mutex.init_always([&]{});
+
+				// Register new blocks to collect samples
+				samples.emplace(pair.first, 0);
 			}
 
-			if (!parg)
+			if (enqueued.empty())
 			{
+				// Interrupt profiler thread and put it to sleep
+				static_cast<void>(prof_mutex.reset());
+				registered.wait();
 				continue;
 			}
 
-			const std::vector<u32>& func = (*parg)->data;
+			// Find the most used enqueued item
+			u64 sample_max = 0;
+			auto found_it = enqueued.begin();
+
+			for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
+			{
+				const u64 cur = std::as_const(samples).at(it->first);
+
+				if (cur > sample_max)
+				{
+					sample_max = cur;
+					found_it = it;
+				}
+			}
+
+			// Start compiling
+			const std::vector<u32>& func = found_it->second->data;
 
 			// Old function pointer (pre-recompiled)
-			const spu_function_t _old = (*parg)->compiled;
+			const spu_function_t _old = found_it->second->compiled;
+
+			// Remove item from the queue
+			enqueued.erase(found_it);
 
 			// Get data start
 			const u32 start = func[0];
@@ -8381,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
 		}
 
 		// Allocate executable area with necessary size
-		const auto result = jit_runtime::alloc(16 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
+		const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
 
 		if (!result)
 		{
@@ -8391,18 +8466,42 @@
 		m_pos = func[0];
 		m_size = (::size32(func) - 1) * 4;
 
+		{
+			sha1_context ctx;
+			u8 output[20];
+
+			sha1_starts(&ctx);
+			sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
+			sha1_finish(&ctx, output);
+
+			be_t<u64> hash_start;
+			std::memcpy(&hash_start, output, sizeof(hash_start));
+			m_hash_start = hash_start;
+		}
+
 		u8* raw = result;
 
-		// 8-byte intruction for patching
-		// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xffff
+		// 8-byte instruction for patching (long NOP)
+		*raw++ = 0x0f;
+		*raw++ = 0x1f;
+		*raw++ = 0x84;
+		*raw++ = 0;
+		*raw++ = 0;
+		*raw++ = 0;
+		*raw++ = 0;
+		*raw++ = 0;
+
+		// mov rax, m_hash_start
+		*raw++ = 0x48;
+		*raw++ = 0xb8;
+		std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
+		raw += 8;
+
+		// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
 		*raw++ = 0x49;
-		*raw++ = 0xc7;
+		*raw++ = 0x89;
 		*raw++ = 0x45;
 		*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-		*raw++ = 0xff;
-		*raw++ = 0xff;
-		*raw++ = 0x00;
-		*raw++ = 0x00;
 
 		// Load PC: mov eax, [r13 + spu_thread::pc]
 		*raw++ = 0x41;
@@ -8445,16 +8544,6 @@ struct spu_fast : public spu_recompiler_base
 		// trap
 		//*raw++ = 0xcc;
 
-		// Update block_hash: mov [r13 + spu_thread::m_block_hash], 0xfffe
-		*raw++ = 0x49;
-		*raw++ = 0xc7;
-		*raw++ = 0x45;
-		*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
-		*raw++ = 0xfe;
-		*raw++ = 0xff;
-		*raw++ = 0x00;
-		*raw++ = 0x00;
-
 		// Secondary prologue: sub rsp,0x28
 		*raw++ = 0x48;
 		*raw++ = 0x83;
@@ -8667,7 +8756,7 @@ struct spu_fast : public spu_recompiler_base
 		if (added)
 		{
 			// Send work to LLVM compiler thread
-			g_fxo->get<spu_llvm>()->registered.push(add_loc);
+			g_fxo->get<spu_llvm>()->registered.push(m_hash_start, add_loc);
 		}
 
 		// Rebuild trampoline if necessary
diff --git a/rpcs3/util/init_mutex.hpp b/rpcs3/util/init_mutex.hpp
index f67a21ba39..f6408d3031 100644
--- a/rpcs3/util/init_mutex.hpp
+++ b/rpcs3/util/init_mutex.hpp
@@ -262,5 +262,18 @@ namespace stx
 		{
 			return (m_state & c_init_bit) != 0;
 		}
+
+		// Wait for access()
+		void wait_for_initialized() const noexcept
+		{
+			const u32 state = m_state;
+
+			if (state & c_init_bit)
+			{
+				return;
+			}
+
+			m_state.wait(state);
+		}
 	};
 }
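A note on the scheduling policy introduced in `spu_llvm::operator()` above: pending blocks are keyed by hash, the profiler thread bumps a per-hash counter whenever it catches an SPU thread executing that block, and the compiler loop always dequeues the pending block with the highest counter. The standalone sketch below models that selection with plain standard-library containers; `PendingItem`, `pick_hottest` and the sample values are illustrative only and are not part of the patch.

```cpp
// Sketch of the "compile the most-sampled block first" policy, assuming
// only that pending work is keyed by hash and that a separate counter map
// is updated by a sampling thread.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

struct PendingItem
{
	std::string name; // stands in for the SPU program data
};

using Queue   = std::unordered_multimap<std::uint64_t, PendingItem>;
using Samples = std::unordered_map<std::uint64_t, std::uint64_t>;

// Return the pending entry whose hash has the highest sample count
Queue::iterator pick_hottest(Queue& enqueued, const Samples& samples)
{
	std::uint64_t sample_max = 0;
	auto found_it = enqueued.begin();

	for (auto it = enqueued.begin(); it != enqueued.end(); ++it)
	{
		// Hashes never seen by the sampler count as zero
		const auto found = samples.find(it->first);
		const std::uint64_t cur = found == samples.end() ? 0 : found->second;

		if (cur > sample_max)
		{
			sample_max = cur;
			found_it = it;
		}
	}

	return found_it;
}

int main()
{
	Queue enqueued{
		{0x1111, {"rarely executed block"}},
		{0x2222, {"hot block"}},
	};

	// Pretend the profiler thread observed hash 0x2222 far more often
	Samples samples{{0x1111, 3}, {0x2222, 250}};

	while (!enqueued.empty())
	{
		const auto it = pick_hottest(enqueued, samples);
		std::cout << "compiling: " << it->second.name << '\n';
		enqueued.erase(it); // remove item from the queue, as in the patch
	}
}
```

When every pending block still has a zero count, the first entry is taken, which mirrors the `sample_max = 0` initialization in the patch.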
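In the compiler loop above, `prof_mutex` gates the sampling thread: `init_always()` is called when new blocks arrive, `reset()` when the queue is empty, `access()` yields no lock while the mutex is not initialized, and the new `wait_for_initialized()` lets the profiler sleep until that changes. The sketch below models this on/off handshake with a plain `std::mutex` and `std::condition_variable`; `profiler_gate` and its method names are stand-ins chosen for illustration, not the real `stx::init_mutex` API.

```cpp
// Simplified model of the enable/disable handshake between the compiler
// loop and the profiler thread; not the stx::init_mutex implementation.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class profiler_gate
{
	std::mutex m;
	std::condition_variable cv;
	bool enabled = false;

public:
	// Compiler loop: enable sampling when work arrives (init_always() analogue)
	void enable()
	{
		{
			std::lock_guard lock(m);
			enabled = true;
		}
		cv.notify_all();
	}

	// Compiler loop: put the profiler to sleep when the queue is empty (reset() analogue)
	void disable()
	{
		std::lock_guard lock(m);
		enabled = false;
	}

	// Profiler thread: succeeds only while sampling is enabled (access() analogue)
	bool try_enter()
	{
		std::lock_guard lock(m);
		return enabled;
	}

	// Profiler thread: block until re-enabled (wait_for_initialized() analogue)
	void wait_enabled()
	{
		std::unique_lock lock(m);
		cv.wait(lock, [this] { return enabled; });
	}
};

int main()
{
	profiler_gate gate;
	std::atomic<bool> stop{false};
	std::atomic<int> rounds{0};

	std::thread profiler([&]
	{
		while (!stop)
		{
			if (!gate.try_enter())
			{
				gate.wait_enabled(); // sleeps while nothing is queued for compilation
				continue;
			}

			++rounds; // stands in for one pass over the running SPU threads
			std::this_thread::sleep_for(std::chrono::milliseconds(1));
		}
	});

	gate.enable();  // new blocks were registered
	std::this_thread::sleep_for(std::chrono::milliseconds(20));
	gate.disable(); // queue drained

	stop = true;
	gate.enable(); // wake the profiler so it can observe `stop` and exit
	profiler.join();

	std::cout << "sampling rounds: " << rounds << '\n';
}
```

The point of the pattern is that the sampling thread costs nothing while no compilation is pending and is woken exactly when new work makes its counters useful.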