diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 81c70f11d0..6abcb3d0ff 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -31,12 +31,6 @@ std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_asmjit_recompiler
 
 spu_runtime::spu_runtime()
 {
-    if (g_cfg.core.spu_debug)
-    {
-        fs::file log(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite);
-        log.write(fmt::format("SPU JIT Log...\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle().c_str(), Emu.GetTitleID().c_str()));
-    }
-
     LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized...");
 
     // Initialize lookup table
@@ -51,8 +45,23 @@ spu_runtime::spu_runtime()
 
 spu_recompiler::spu_recompiler(SPUThread& spu)
     : spu_recompiler_base(spu)
-    , m_rt(std::make_shared<asmjit::JitRuntime>())
 {
+    if (!g_cfg.core.spu_shared_runtime)
+    {
+        m_spurt = std::make_shared<spu_runtime>();
+    }
+}
+
+spu_function_t spu_recompiler::get(u32 lsa)
+{
+    // Initialize if necessary
+    if (!m_spurt)
+    {
+        m_spurt = fxm::get_always<spu_runtime>();
+    }
+
+    // Simple atomic read
+    return m_spurt->m_dispatcher[lsa / 4];
 }
 
 spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
@@ -63,6 +72,24 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
         m_spurt = fxm::get_always<spu_runtime>();
     }
 
+    // Don't lock without shared runtime
+    std::unique_lock<shared_mutex> lock(m_spurt->m_mutex, std::defer_lock);
+
+    if (g_cfg.core.spu_shared_runtime)
+    {
+        lock.lock();
+    }
+
+    // Try to find existing function
+    {
+        const auto found = m_spurt->m_map.find(func);
+
+        if (found != m_spurt->m_map.end() && found->second)
+        {
+            return found->second;
+        }
+    }
+
     using namespace asmjit;
 
     SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@@ -78,8 +105,9 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
     if (g_cfg.core.spu_debug)
     {
         fmt::append(log, "========== SPU BLOCK 0x%05x (size %u) ==========\n\n", func[0], func.size() - 1);
     }
 
-    asmjit::CodeHolder code;
-    code.init(m_rt->getCodeInfo());
+    CodeHolder code;
+    code.init(m_spurt->m_jitrt.getCodeInfo());
+    code._globalHints = asmjit::CodeEmitter::kHintOptimizedAlign;
 
     X86Assembler compiler(&code);
     this->c = &compiler;
@@ -626,7 +654,7 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
     c->align(kAlignCode, 16);
     c->bind(label_diff);
     c->inc(SPU_OFF_64(block_failure));
-    c->jmp(asmjit::imm_ptr(&spu_recompiler_base::dispatch));
+    c->jmp(imm_ptr(&spu_recompiler_base::dispatch));
 
     for (auto&& work : decltype(after)(std::move(after)))
     {
@@ -648,15 +676,228 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
     // Compile and get function address
     spu_function_t fn;
 
-    if (m_rt->add(&fn, &code))
+    if (m_spurt->m_jitrt.add(&fn, &code))
     {
         LOG_FATAL(SPU, "Failed to build a function");
     }
 
+    // Register function
+    m_spurt->m_map[func] = fn;
+
+    // Generate a dispatcher (übertrampoline)
+    std::vector<u32> addrv{func[0]};
+    const auto beg = m_spurt->m_map.lower_bound(addrv);
+    addrv[0] += 4;
+    const auto end = m_spurt->m_map.lower_bound(addrv);
+    const u32 size0 = std::distance(beg, end);
+
+    if (size0 == 1)
+    {
+        m_spurt->m_dispatcher[func[0] / 4] = fn;
+    }
+    else
+    {
+        CodeHolder code;
+        code.init(m_spurt->m_jitrt.getCodeInfo());
+
+        X86Assembler compiler(&code);
+        this->c = &compiler;
+
+        if (g_cfg.core.spu_debug)
+        {
+            // Set logger
+            code.setLogger(&logger);
+        }
+
+        compiler.comment("\n\nTrampoline:\n\n");
+
+        struct work
+        {
+            u32 size;
+            u32 level;
+            Label label;
+            std::map<std::vector<u32>, spu_function_t>::iterator beg;
+            std::map<std::vector<u32>, spu_function_t>::iterator end;
+        };
+
+        std::vector<work> workload;
+        workload.reserve(size0);
+        workload.emplace_back();
+        workload.back().size = size0;
+        workload.back().level = 1;
+        workload.back().beg = beg;
+        workload.back().end = end;
+
+        for (std::size_t i = 0; i < workload.size(); i++)
+        {
+            // Get copy of the workload info
+            work w = workload[i];
+
+            // Split range in two parts
+            auto it = w.beg;
+            auto it2 = w.beg;
+            u32 size1 = w.size / 2;
+            u32 size2 = w.size - size1;
+            std::advance(it2, w.size / 2);
+
+            while (true)
+            {
+                it = it2;
+                size1 = w.size - size2;
+
+                // Adjust ranges (forward)
+                while (it != w.end && w.beg->first.at(w.level) == it->first.at(w.level))
+                {
+                    it++;
+                    size1++;
+                }
+
+                if (it == w.end)
+                {
+                    // Cannot split: words are identical within the range at this level
+                    w.level++;
+                }
+                else
+                {
+                    size2 = w.size - size1;
+                    break;
+                }
+            }
+
+            // Value for comparison
+            const u32 x = it->first.at(w.level);
+
+            // Adjust ranges (backward)
+            while (true)
+            {
+                it--;
+
+                if (it->first.at(w.level) != x)
+                {
+                    it++;
+                    break;
+                }
+
+                verify(HERE), it != w.beg;
+                size1--;
+                size2++;
+            }
+
+            if (w.label.isValid())
+            {
+                c->align(kAlignCode, 16);
+                c->bind(w.label);
+            }
+
+            c->cmp(x86::dword_ptr(*ls, func[0] + (w.level - 1) * 4), x);
+
+            // Low subrange target label
+            Label label_below;
+
+            if (size1 == 1)
+            {
+                label_below = c->newLabel();
+                c->jb(label_below);
+            }
+            else
+            {
+                workload.push_back(w);
+                workload.back().end = it;
+                workload.back().size = size1;
+                workload.back().label = c->newLabel();
+                c->jb(workload.back().label);
+            }
+
+            // Second subrange target
+            const auto target = it->second ? it->second : &dispatch;
+
+            if (size2 == 1)
+            {
+                c->jmp(imm_ptr(target));
+            }
+            else
+            {
+                it2 = it;
+
+                // Select additional midrange for equality comparison
+                while (it2 != w.end && it2->first.at(w.level) == x)
+                {
+                    size2--;
+                    it2++;
+                }
+
+                if (it2 != w.end)
+                {
+                    // High subrange target label
+                    Label label_above;
+
+                    if (size2 == 1)
+                    {
+                        label_above = c->newLabel();
+                        c->ja(label_above);
+                    }
+                    else
+                    {
+                        workload.push_back(w);
+                        workload.back().beg = it2;
+                        workload.back().size = size2;
+                        workload.back().label = c->newLabel();
+                        c->ja(workload.back().label);
+                    }
+
+                    const u32 size3 = w.size - size1 - size2;
+
+                    if (size3 == 1)
+                    {
+                        c->jmp(imm_ptr(target));
+                    }
+                    else
+                    {
+                        workload.push_back(w);
+                        workload.back().beg = it;
+                        workload.back().end = it2;
+                        workload.back().size = size3;
+                        workload.back().label = c->newLabel();
+                        c->jmp(workload.back().label);
+                    }
+
+                    if (label_above.isValid())
+                    {
+                        c->bind(label_above);
+                        c->jmp(imm_ptr(it2->second ? it2->second : &dispatch));
+                    }
+                }
+                else
+                {
+                    workload.push_back(w);
+                    workload.back().beg = it;
+                    workload.back().size = w.size - size1;
+                    workload.back().label = c->newLabel();
+                    c->jmp(workload.back().label);
+                }
+            }
+
+            if (label_below.isValid())
+            {
+                c->bind(label_below);
+                c->jmp(imm_ptr(w.beg->second ? w.beg->second : &dispatch));
+            }
+        }
+
+        spu_function_t tr;
+
+        if (m_spurt->m_jitrt.add(&tr, &code))
+        {
+            LOG_FATAL(SPU, "Failed to build a trampoline");
+        }
+
+        m_spurt->m_dispatcher[func[0] / 4] = tr;
+    }
+
     if (g_cfg.core.spu_debug)
     {
         // Add ASMJIT logs
-        fmt::append(log, "{%s} Address: %p\n\n", m_spu.get_name(), fn);
+        fmt::append(log, "Address: %p (%p)\n\n", fn, +m_spurt->m_dispatcher[func[0] / 4]);
         log += logger.getString();
         log += "\n\n\n";
@@ -731,25 +972,24 @@ void spu_recompiler::branch_fixed(u32 target)
     Label patch_point = c->newLabel();
     c->lea(*qw0, x86::qword_ptr(patch_point));
     c->mov(SPU_OFF_32(pc), target);
-    c->align(kAlignCode, 16);
+
+    // Need to emit exactly one executable instruction within 8 bytes
+    c->align(kAlignCode, 8);
     c->bind(patch_point);
 
-    const auto result = m_spu.jit_map.emplace(block(m_spu, target), nullptr);
+    const auto result = m_spurt->m_map.emplace(block(m_spu, target), nullptr);
 
     if (result.second || !result.first->second)
     {
         if (result.first->first.size())
         {
             // Target block hasn't been compiled yet, record overwriting position
-            c->mov(*ls, imm_ptr(&*result.first));
             c->jmp(imm_ptr(&spu_recompiler_base::branch));
         }
         else
         {
-            // SPURS Workload entry point or similar thing
-            c->mov(x86::r10, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher) + target * 2));
-            c->xor_(qw0->r32(), qw0->r32());
-            c->jmp(x86::r10);
+            // SPURS Workload entry point or similar thing (emit 8-byte NOP)
+            c->dq(0x841f0f);
         }
     }
     else
@@ -757,7 +997,14 @@ void spu_recompiler::branch_fixed(u32 target)
     {
         c->jmp(imm_ptr(result.first->second));
     }
 
-    c->align(kAlignCode, 16);
+    // Branch via dispatcher (occupies 16 bytes including padding)
+    c->align(kAlignCode, 8);
+    c->mov(x86::rax, x86::qword_ptr(*cpu, offset32(&SPUThread::jit_dispatcher) + target * 2));
+    c->xor_(qw0->r32(), qw0->r32());
+    c->jmp(x86::rax);
+    c->align(kAlignCode, 8);
+    c->dq(reinterpret_cast<u64>(&*result.first));
+    c->dq(reinterpret_cast<u64>(result.first->second));
 }
 
 void spu_recompiler::branch_indirect(spu_opcode_t op)
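
Note: the übertrampoline emitted above is a binary search compiled into a `cmp`/`jb`/`ja` tree. Among all known blocks that start at the same LS address, it descends one word comparison per level to the candidate whose recorded instruction words match local storage. A rough stand-alone model of the selection logic follows; it is not code from this patch (`select_block`, `fallback` and `block_map` are illustrative names), and it assumes for simplicity that all candidate keys have the same length:

```cpp
#include <cstdint>
#include <iterator>
#include <map>
#include <vector>

using u32 = std::uint32_t;
using spu_function_t = void (*)();

void fallback() {} // stand-in for spu_recompiler_base::dispatch

// All compiled blocks, keyed by contents: {addr, word0, word1, ...}
using block_map = std::map<std::vector<u32>, spu_function_t>;

// Pick a candidate among [beg, end), which agree on all key words below `level`
spu_function_t select_block(block_map::const_iterator beg, block_map::const_iterator end,
                            u32 level, const u32* ls_words)
{
    if (std::next(beg) == end)
    {
        // Single candidate left: its own prologue verifies the full block
        return beg->second ? beg->second : &fallback;
    }

    // Find the first word position where the remaining candidates differ;
    // keys are sorted and share a prefix, so comparing the two ends suffices
    while (beg->first.at(level) == std::prev(end)->first.at(level))
    {
        level++;
    }

    // key[0] is the block address, key[1..] are instruction words, so
    // level N corresponds to the LS word at offset N - 1 from the entry
    const u32 x = ls_words[level - 1];

    // Narrow to the subrange whose word at this level equals the LS word
    auto lo = beg;
    while (lo != end && lo->first.at(level) < x) ++lo;
    auto hi = lo;
    while (hi != end && hi->first.at(level) == x) ++hi;

    return lo == hi ? &fallback : select_block(lo, hi, level + 1, ls_words);
}
```

The generated tree has no explicit "no match" exit the way this model does: an LS word that equals no candidate's word simply falls into the nearest subrange, and the chosen function's own prologue comparison (the `label_diff` path above) rejects it, bumps `block_failure` and re-enters `dispatch`.
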
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
index 330dac9798..42b79de6a8 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@@ -32,13 +32,13 @@ public:
 // SPU ASMJIT Recompiler
 class spu_recompiler : public spu_recompiler_base
 {
-    const std::shared_ptr<asmjit::JitRuntime> m_rt;
-
     std::shared_ptr<spu_runtime> m_spurt;
 
 public:
     spu_recompiler(class SPUThread& spu);
 
+    virtual spu_function_t get(u32 lsa) override;
+
     virtual spu_function_t compile(const std::vector<u32>& func) override;
 
 private:
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 72c54c6257..8908bd5ec1 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -16,9 +16,6 @@ spu_recompiler_base::spu_recompiler_base(SPUThread& spu)
 {
     // Initialize lookup table
     spu.jit_dispatcher.fill(&dispatch);
-
-    // Initialize "empty" block
-    spu.jit_map[std::vector<u32>()] = &dispatch;
 }
 
 spu_recompiler_base::~spu_recompiler_base()
@@ -27,73 +24,89 @@ spu_recompiler_base::~spu_recompiler_base()
 {
 }
 
 void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip)
 {
-    const auto result = spu.jit_map.emplace(block(spu, spu.pc), nullptr);
-
-    if (result.second || !result.first->second)
+    // If check failed after direct branch, patch it with single NOP
+    if (rip)
     {
-        result.first->second = spu.jit->compile(result.first->first);
+#ifdef _MSC_VER
+        *(volatile u64*)(rip) = 0x841f0f;
+#else
+        __atomic_store_n(reinterpret_cast<u64*>(rip), 0x841f0f, __ATOMIC_RELAXED);
+#endif
     }
 
-    spu.jit_dispatcher[spu.pc / 4] = result.first->second;
+    const auto func = spu.jit->get(spu.pc);
+
+    // First attempt (load new trampoline and retry)
+    if (func != spu.jit_dispatcher[spu.pc / 4])
+    {
+        spu.jit_dispatcher[spu.pc / 4] = func;
+        return;
+    }
+
+    // Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
+    if (spu.block_counter != spu.block_recover && func != &dispatch)
+    {
+        spu.block_recover = spu.block_counter;
+        return;
+    }
+
+    // Compile
+    verify(HERE), spu.jit->compile(block(spu, spu.pc));
+    spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc);
 }
 
-void spu_recompiler_base::branch(SPUThread& spu, std::pair<const std::vector<u32>, spu_function_t>* pair, u8* rip)
+void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip)
 {
+    const auto pair = *reinterpret_cast<std::pair<const std::vector<u32>, spu_function_t>**>(rip + 24);
+
     spu.pc = pair->first[0];
 
-    if (!pair->second)
-    {
-        pair->second = spu.jit->compile(pair->first);
-    }
+    const auto func = pair->second ? pair->second : spu.jit->compile(pair->first);
 
-    spu.jit_dispatcher[spu.pc / 4] = pair->second;
+    verify(HERE), func, pair->second == func;
+
+    // Overwrite function address
+    reinterpret_cast<atomic_t<spu_function_t>*>(rip + 32)->store(func);
 
     // Overwrite jump to this function with jump to the compiled function
-    const s64 rel = reinterpret_cast<u64>(pair->second) - reinterpret_cast<u64>(rip) - 5;
+    const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
+
+    alignas(8) u8 bytes[8];
 
     if (rel >= INT32_MIN && rel <= INT32_MAX)
     {
         const s64 rel8 = (rel + 5) - 2;
 
-        alignas(8) u8 bytes[8];
-
         if (rel8 >= INT8_MIN && rel8 <= INT8_MAX)
         {
             bytes[0] = 0xeb; // jmp rel8
             bytes[1] = static_cast<s8>(rel8);
-            std::memset(bytes + 2, 0x90, 5);
-            bytes[7] = 0x48;
+            std::memset(bytes + 2, 0x90, 6);
         }
         else
         {
             bytes[0] = 0xe9; // jmp rel32
             std::memcpy(bytes + 1, &rel, 4);
-            std::memset(bytes + 5, 0x90, 2);
-            bytes[7] = 0x48;
+            std::memset(bytes + 5, 0x90, 3);
         }
-
-#ifdef _MSC_VER
-        *(volatile u64*)(rip) = *reinterpret_cast<u64*>(+bytes);
-#else
-        __atomic_store_n(reinterpret_cast<u64*>(rip), *reinterpret_cast<u64*>(+bytes), __ATOMIC_RELAXED);
-#endif
     }
     else
    {
-        alignas(16) u8 bytes[16];
-
-        bytes[0] = 0xff; // jmp [rip+2]
+        bytes[0] = 0xff; // jmp [rip+26]
         bytes[1] = 0x25;
-        bytes[2] = 0x02;
+        bytes[2] = 0x1a;
         bytes[3] = 0x00;
         bytes[4] = 0x00;
         bytes[5] = 0x00;
-        bytes[6] = 0x48; // mov rax, imm64 (not executed)
-        bytes[7] = 0xb8;
-        std::memcpy(bytes + 8, &pair->second, 8);
-
-        reinterpret_cast<atomic_t<u128>*>(rip)->store(*reinterpret_cast<u128*>(+bytes));
+        bytes[6] = 0x90;
+        bytes[7] = 0x90;
     }
+
+#ifdef _MSC_VER
+    *(volatile u64*)(rip) = *reinterpret_cast<u64*>(+bytes);
+#else
+    __atomic_store_n(reinterpret_cast<u64*>(rip), *reinterpret_cast<u64*>(+bytes), __ATOMIC_RELAXED);
+#endif
 }
 
 std::vector<u32> spu_recompiler_base::block(SPUThread& spu, u32 lsa)
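
Note: `branch_fixed` in SPUASMJITRecompiler.cpp and `branch()`/`dispatch()` above now share a fixed patch-point layout: an 8-byte patchable head, a 16-byte dispatcher-table branch, then two data qwords. `branch()` finds the map entry at `rip + 24`, publishes the compiled address at `rip + 32`, and atomically rewrites the head; `dispatch()` can demote a failed direct branch with a single 8-byte NOP (`0F 1F 84 00 00 00 00 00`, the `0x841f0f` constant). The struct below is only a sketch of that contract, with offsets inferred from the code; nothing in the patch defines such a type:

```cpp
#include <cstdint>

// Hypothetical model of the code+data block emitted by branch_fixed()
struct alignas(8) patch_point_layout
{
    // +0: initially "jmp spu_recompiler_base::branch"; later atomically
    // replaced by either a direct "jmp rel8/rel32" to the compiled function
    // or an 8-byte NOP that falls through to the dispatcher branch below
    std::uint8_t patchable[8];

    // +8: mov rax, [cpu + jit_dispatcher + target * 2]; xor; jmp rax
    // ("occupies 16 bytes including padding")
    std::uint8_t dispatcher_branch[16];

    // +24: pointer to the m_map entry, read by branch() to find the block
    const void* map_entry;

    // +32: compiled function address, stored by branch(); the far-jump
    // form "jmp [rip+26]" (ff 25 1a 00 00 00) loads its target from here
    void (*func)();
};

static_assert(sizeof(patch_point_layout) == 40, "must match the emitted code");
```

The far-jump encoding works out because `jmp [rip+26]` decodes at offset 0 of the head and is 6 bytes long, so its memory operand resolves to offset 6 + 26 = 32, exactly the published function pointer. Since every rewrite is a single aligned 8-byte store, threads racing through the patch point observe either the old or the new instruction, never a torn mix.
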
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index b687c4645b..f87fe92e6b 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -15,6 +15,9 @@ public:
 
     virtual ~spu_recompiler_base();
 
+    // Get pointer to the trampoline at given position
+    virtual spu_function_t get(u32 lsa) = 0;
+
     // Compile function
     virtual spu_function_t compile(const std::vector<u32>& func) = 0;
 
@@ -22,7 +25,7 @@ public:
     static void dispatch(SPUThread&, void*, u8*);
 
     // Direct branch fallback for non-compiled destination
-    static void branch(SPUThread&, std::pair<const std::vector<u32>, spu_function_t>*, u8* rip);
+    static void branch(SPUThread&, void*, u8*);
 
     // Get the block at specified address
     static std::vector<u32> block(SPUThread&, u32 lsa);
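
Note: with `jit_map` gone from SPUThread, a dispatcher-table miss no longer means "compile now"; `dispatch()` in SPURecompiler.cpp distinguishes a stale per-thread cache from a genuinely unknown block before paying for compilation. A self-contained model of the three stages follows; the types are illustrative stand-ins for the real SPUThread/recompiler interfaces, not code from this patch:

```cpp
#include <array>
#include <cstdint>

using spu_function_t = void (*)();

struct jit_iface
{
    virtual spu_function_t get(std::uint32_t lsa) = 0;     // current trampoline
    virtual spu_function_t compile(std::uint32_t lsa) = 0; // build block at lsa
};

struct spu_ctx
{
    std::uint32_t pc = 0;
    std::uint64_t block_counter = 0; // bumped as blocks execute
    std::uint64_t block_recover = 0;
    std::array<spu_function_t, 0x10000> dispatcher{}; // per-thread cache
};

// `self` stands for &dispatch, the "no trampoline yet" marker
void on_miss(spu_ctx& spu, jit_iface& jit, spu_function_t self)
{
    const auto func = jit.get(spu.pc);

    // Stage 1: the shared trampoline changed since this thread cached it
    // (another thread compiled something) - refresh the cache and retry
    if (func != spu.dispatcher[spu.pc / 4])
    {
        spu.dispatcher[spu.pc / 4] = func;
        return;
    }

    // Stage 2: the cache is current but the trampoline still bounced us
    // here; if blocks have executed since the last recovery attempt, record
    // the progress point and retry once more before compiling
    if (spu.block_counter != spu.block_recover && func != self)
    {
        spu.block_recover = spu.block_counter;
        return;
    }

    // Stage 3: genuinely unknown or stuck - compile and refresh the cache
    jit.compile(spu.pc);
    spu.dispatcher[spu.pc / 4] = jit.get(spu.pc);
}
```

The second stage is what breaks the livelock when a trampoline repeatedly selects a block whose prologue check fails.
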
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index deb0d6b9d9..a2c45b4a54 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -594,8 +594,6 @@ public:
 
     std::unique_ptr<spu_recompiler_base> jit; // Recompiler instance
 
-    std::map<std::vector<u32>, spu_function_t> jit_map; // All compiled blocks (first u32 is addr)
-
     u64 block_counter = 0;
     u64 block_recover = 0;
     u64 block_failure = 0;
diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp
index 55242214a1..7d53db7945 100644
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@@ -841,6 +841,12 @@ void Emulator::Load(bool add_only)
         LOG_NOTICE(LOADER, "Elf path: %s", argv[0]);
     }
 
+    if (g_cfg.core.spu_debug)
+    {
+        fs::file log(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite);
+        log.write(fmt::format("SPU JIT Log\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle(), Emu.GetTitleID()));
+    }
+
     ppu_load_exec(ppu_exec);
 
     fxm::import<GSRender>(Emu.GetCallbacks().get_gs_render); // TODO: must be created in appropriate sys_rsx syscall
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index 59fb01fa70..9c046470f7 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -302,6 +302,7 @@ struct cfg_root : cfg::node
     cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous spu tasks
     cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free
     cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; //Try to detect wait loops and trigger thread yield
+    cfg::_bool spu_shared_runtime{this, "SPU Shared Runtime", true}; // Share compiled SPU functions between all threads
     cfg::_enum<lib_loading_type> lib_loading{this, "Lib Loader", lib_loading_type::liblv2only};
     cfg::_bool hook_functions{this, "Hook static functions"};
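
Note: the new "SPU Shared Runtime" option decides whether each `spu_recompiler` owns a private `spu_runtime` (constructed eagerly in its constructor) or lazily attaches to one process-wide instance via `fxm::get_always`. Sharing is what makes the übertrampolines pay off: a block compiled by one SPU thread becomes reachable from every thread's `jit_dispatcher`, at the cost of serializing `compile()` on `m_spurt->m_mutex`. A minimal sketch of the attach pattern, where `get_always` is a simplified stand-in for rpcs3's fxm fixed-object manager rather than its real implementation:

```cpp
#include <memory>
#include <mutex>

// Simplified stand-in for fxm::get_always: create-on-first-use singleton
template <typename T>
std::shared_ptr<T> get_always()
{
    static std::mutex m;
    static std::shared_ptr<T> instance;
    std::lock_guard<std::mutex> lock(m);
    if (!instance)
    {
        instance = std::make_shared<T>();
    }
    return instance;
}

struct spu_runtime_stub {}; // stands for spu_runtime

struct recompiler_stub
{
    std::shared_ptr<spu_runtime_stub> m_spurt;

    explicit recompiler_stub(bool shared_runtime)
    {
        if (!shared_runtime)
        {
            m_spurt = std::make_shared<spu_runtime_stub>(); // private runtime
        }
    }

    void ensure() // mirrors the lazy init in get()/compile()
    {
        if (!m_spurt)
        {
            m_spurt = get_always<spu_runtime_stub>(); // attach to shared one
        }
    }
};
```
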