From 8c28c4e8ec8c6160945ef2bbc46a40caea445ee6 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 26 Oct 2019 00:52:56 +0300 Subject: [PATCH] SPU: Make recompilers lock-free. --- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 40 +-- rpcs3/Emu/Cell/SPUASMJITRecompiler.h | 2 +- rpcs3/Emu/Cell/SPURecompiler.cpp | 366 +++++++++++++------------ rpcs3/Emu/Cell/SPURecompiler.h | 100 +++---- 4 files changed, 248 insertions(+), 260 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index bb50f8e227..fa0f81056a 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -45,24 +45,23 @@ void spu_recompiler::init() } } -spu_function_t spu_recompiler::compile(const std::vector& func, void* fn_location) +spu_function_t spu_recompiler::compile(std::vector&& _func) { - if (!fn_location) - { - fn_location = m_spurt->find(func); - } + const auto add_loc = m_spurt->add_empty(std::move(_func)); - if (fn_location == spu_runtime::g_dispatcher) - { - return &dispatch; - } - - if (!fn_location) + if (!add_loc) { return nullptr; } - if (auto cache = g_fxo->get(); cache && g_cfg.core.spu_cache) + if (add_loc->compiled) + { + return add_loc->compiled; + } + + const std::vector& func = add_loc->data; + + if (auto cache = g_fxo->get(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1)) { cache->add(func); } @@ -94,10 +93,10 @@ spu_function_t spu_recompiler::compile(const std::vector& func, void* fn_lo X86Assembler compiler(&code); this->c = &compiler; - if (g_cfg.core.spu_debug) + if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) { // Dump analyser data - this->dump(log); + this->dump(func, log); fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); // Set logger @@ -892,12 +891,21 @@ spu_function_t spu_recompiler::compile(const std::vector& func, void* fn_lo LOG_FATAL(SPU, "Failed to build a function"); } - if (!m_spurt->add(fn_location, fn)) + // Install compiled function pointer + const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn); + + // Rebuild trampoline if necessary + if (!m_spurt->rebuild_ubertrampoline(func[1])) { return nullptr; } - if (g_cfg.core.spu_debug) + if (added) + { + add_loc->compiled.notify_all(); + } + + if (g_cfg.core.spu_debug && added) { // Add ASMJIT logs fmt::append(log, "Address: %p\n\n", fn); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index b639f1aa6c..4848ebfe5d 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -13,7 +13,7 @@ public: virtual void init() override; - virtual spu_function_t compile(const std::vector&, void*) override; + virtual spu_function_t compile(std::vector&&) override; private: // ASMJIT runtime diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index a825c4e8ad..f83bcf98bb 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -400,15 +400,19 @@ void spu_cache::initialize() { compiler->init(); - if (compiler->compile({}, nullptr) && spu_runtime::g_interpreter) + if (compiler->compile({}) && spu_runtime::g_interpreter) { - LOG_SUCCESS(SPU, "SPU Runtime: built interpreter."); + LOG_SUCCESS(SPU, "SPU Runtime: Built the interpreter."); if (g_cfg.core.spu_decoder != spu_decoder_type::llvm) { return; } } + else + { + LOG_FATAL(SPU, "SPU Runtime: Failed to build the interpreter."); + } } } @@ -472,34 +476,21 @@ void spu_cache::initialize() } // Call analyser - const std::vector& func2 = compiler->analyse(ls.data(), func[0]); + std::vector func2 = compiler->analyse(ls.data(), func[0]); if (func2.size() != size0) { LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); } - if (!compiler->compile(func, nullptr)) + if (!compiler->compile(std::move(func2))) { // Likely, out of JIT memory. Signal to prevent further building. fail_flag |= 1; } // Clear fake LS - for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4) - { - if (se_storage::swap(func2[i]) != ls[pos / 4]) - { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos); - } - - ls[pos / 4] = 0; - } - - if (func2.size() != size0) - { - std::memset(ls.data(), 0, 0x40000); - } + std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1)); g_progr_pdone++; } @@ -519,7 +510,7 @@ void spu_cache::initialize() if (fail_flag) { - LOG_ERROR(SPU, "SPU Runtime: Cache building failed (too much data). SPU Cache will be disabled."); + LOG_FATAL(SPU, "SPU Runtime: Cache building failed (out of memory)."); return; } @@ -581,9 +572,6 @@ bool spu_runtime::func_compare::operator()(const std::vector& lhs, const st spu_runtime::spu_runtime() { - // Initialize "empty" block - m_map[std::vector()] = tr_dispatch; - // Clear LLVM output m_cache_path = Emu.PPUCache(); @@ -602,60 +590,77 @@ spu_runtime::spu_runtime() } } -bool spu_runtime::add(void* _where, spu_function_t compiled) +spu_item* spu_runtime::add_empty(std::vector&& data) { - writer_lock lock(*this); - - if (!_where) + if (data.size() <= 1) { - return false; + return nullptr; } - // Use opaque pointer - auto& where = *static_cast(_where); + // Store previous item if already added + spu_item* prev = nullptr; - // Function info - const std::vector& func = get_func(_where); - - // - const u32 _off = 1 + (func[0] / 4) * (false); - - // Set pointer to the compiled function - where.second = compiled; - - // Register function in PIC map - m_pic_map[{func.data() + _off, func.size() - _off}] = compiled; - - if (func.size() > 1) + //Try to add item that doesn't exist yet + const auto ret = m_stuff[data[1] >> 12].push_if([&](spu_item& _new, spu_item& _old) { - // Rebuild trampolines if necessary - if (const auto new_tr = rebuild_ubertrampoline(func[1])) - { - g_dispatcher->at(func[1] >> 12) = new_tr; - } - else + std::basic_string_view lhs{_new.data.data() + 1, _new.data.size() - 1}; + std::basic_string_view rhs{_old.data.data() + 1, _old.data.size() - 1}; + + if (lhs == rhs) { + prev = &_old; return false; } + + return true; + }, std::move(data)); + + if (ret) + { + return ret; } - // Notify in lock destructor - lock.notify = true; - return true; + return prev; } spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst) { // Prepare sorted list - m_flat_list.clear(); - { - // Select required subrange (fixed 20 bits for single pos in g_dispatcher table) - const u32 id_lower = id_inst & ~0xfff; - const u32 id_upper = id_inst | 0xfff; + static thread_local std::vector, spu_function_t>> m_flat_list; - m_flat_list.assign(m_pic_map.lower_bound({&id_lower, 1}), m_pic_map.upper_bound({&id_upper, 1})); + // Remember top position + auto stuff_it = m_stuff.at(id_inst >> 12).begin(); + auto stuff_end = m_stuff.at(id_inst >> 12).end(); + { + if (stuff_it->trampoline) + { + return stuff_it->trampoline; + } + + m_flat_list.clear(); + + for (auto it = stuff_it; it != stuff_end; ++it) + { + if (const auto ptr = it->compiled.load()) + { + std::basic_string_view range{it->data.data() + 1, it->data.size() - 1}; + m_flat_list.emplace_back(range, ptr); + } + else + { + // Pull oneself deeper (TODO) + ++stuff_it; + } + } } + std::sort(m_flat_list.begin(), m_flat_list.end(), [&](const auto& a, const auto& b) + { + std::basic_string_view lhs = a.first; + std::basic_string_view rhs = b.first; + return lhs < rhs; + }); + struct work { u32 size; @@ -674,6 +679,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst) const auto _end = m_flat_list.end(); const u32 size0 = ::size32(m_flat_list); + auto result = beg->second; + if (size0 != 1) { // Allocate some writable executable memory @@ -944,77 +951,63 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst) } workload.clear(); - return reinterpret_cast(reinterpret_cast(wxptr)); + result = reinterpret_cast(reinterpret_cast(wxptr)); } - // No trampoline required - return beg->second; -} - -void* spu_runtime::find(const std::vector& func) -{ - writer_lock lock(*this); - - // - const u32 _off = 1 + (func[0] / 4) * (false); - - // Try to find PIC first - const auto found = m_pic_map.find({func.data() + _off, func.size() - _off}); - - if (found != m_pic_map.end()) + if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result)) { - // Wait if already in progress - while (!found->second) + return _old; + } + + // Install ubertrampoline + auto& insert_to = spu_runtime::g_dispatcher->at(id_inst >> 12); + + auto _old = insert_to.load(); + + do + { + // Make sure we are replacing an older ubertrampoline but not newer one + if (_old != tr_dispatch) { - m_cond.wait(m_mutex); + bool ok = false; + + for (auto it = stuff_it; it != stuff_end; ++it) + { + if (it->trampoline == _old) + { + ok = true; + break; + } + } + + if (!ok) + { + return result; + } } - - // Already compiled - return g_dispatcher; } + while (!insert_to.compare_exchange(_old, result)); - // Try to find existing function, register new one if necessary - const auto result = m_map.try_emplace(func, nullptr); - - // Add PIC entry as well - m_pic_map.try_emplace({result.first->first.data() + _off, result.first->first.size() - _off}, nullptr); - - // Pointer to the value in the map (pair) - const auto fn_location = &*result.first; - - if (fn_location->second) - { - // Already compiled - return g_dispatcher; - } - else if (!result.second) - { - // Wait if already in progress - while (!fn_location->second) - { - m_cond.wait(m_mutex); - } - - return g_dispatcher; - } - - // Return location to compile and use in add() - return fn_location; + return result; } spu_function_t spu_runtime::find(const u32* ls, u32 addr) const { - reader_lock lock(this->m_mutex); - - const auto upper = m_pic_map.upper_bound({ls + addr / 4, (0x40000 - addr) / 4}); - - if (upper != m_pic_map.begin()) + for (auto& item : m_stuff.at(ls[addr / 4] >> 12)) { - const auto found = std::prev(upper); - - if (found->first.compare(0, found->first.size(), ls + addr / 4, found->first.size()) == 0) + if (const auto ptr = item.compiled.load()) { - return found->second; + std::basic_string_view range{item.data.data() + 1, item.data.size() - 1}; + + if (addr / 4 + range.size() > 0x10000) + { + continue; + } + + if (range.compare(0, range.size(), ls + addr / 4, range.size()) == 0) + { + return ptr; + } } } @@ -1055,18 +1048,12 @@ spu_function_t spu_runtime::make_branch_patchpoint() const spu_recompiler_base::spu_recompiler_base() { - result.reserve(8192); } spu_recompiler_base::~spu_recompiler_base() { } -void spu_recompiler_base::make_function(const std::vector& data) -{ - compile(data, nullptr); -} - void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) { // If code verification failed from a patched patchpoint, clear it with a dispatcher jump @@ -1082,7 +1069,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) bytes[0] = 0xe9; // jmp rel32 std::memcpy(bytes + 1, &rel, 4); - bytes[5] = 0x90; + bytes[5] = 0x66; // lnop (2 bytes) bytes[6] = 0x90; bytes[7] = 0x90; @@ -1096,8 +1083,17 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) return; } + spu.jit->init(); + // Compile - spu.jit->make_function(spu.jit->analyse(spu._ptr(0), spu.pc)); + const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr(0), spu.pc)); + + if (!func) + { + LOG_FATAL(SPU, "[0x%05x] Compilation failed.", spu.pc); + Emu.Pause(); + return; + } // Diagnostic if (g_cfg.core.spu_block_size == spu_block_size_type::giga) @@ -1109,6 +1105,8 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip) LOG_TRACE(SPU, "Called from 0x%x", _info._u32[2] - 4); } } + + spu_runtime::g_tail_escape(&spu, func, nullptr); } void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip) @@ -1156,6 +1154,8 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip) } atomic_storage::release(*reinterpret_cast(rip), result); + + spu_runtime::g_tail_escape(&spu, func, rip); } void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* rip) try @@ -1189,10 +1189,11 @@ catch (const std::exception& e) LOG_NOTICE(GENERAL, "\n%s", spu.dump()); } -const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) +std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) { // Result: addr + raw instruction data - result.clear(); + std::vector result; + result.reserve(10000); result.push_back(entry_point); // Initialize block entries @@ -3120,7 +3121,7 @@ const std::vector& spu_recompiler_base::analyse(const be_t* ls, u32 en return result; } -void spu_recompiler_base::dump(std::string& out) +void spu_recompiler_base::dump(const std::vector& result, std::string& out) { SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); dis_asm.offset = reinterpret_cast(result.data() + 1); @@ -4153,31 +4154,25 @@ public: } } - virtual spu_function_t compile(const std::vector& func, void* fn_location) override + virtual spu_function_t compile(std::vector&& _func) override { - if (func.empty() && m_interp_magn) + if (_func.empty() && m_interp_magn) { return compile_interpreter(); } - if (!fn_location) - { - fn_location = m_spurt->find(func); - } + const auto add_loc = m_spurt->add_empty(std::move(_func)); - if (fn_location == spu_runtime::g_dispatcher) - { - return &dispatch; - } - - if (!fn_location) + if (!add_loc) { return nullptr; } + const std::vector& func = add_loc->data; + std::string log; - if (auto cache = g_fxo->get(); cache && g_cfg.core.spu_cache) + if (auto cache = g_fxo->get(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1)) { cache->add(func); } @@ -4206,9 +4201,9 @@ public: const u32 start = m_pos; const u32 end = start + m_size; - if (g_cfg.core.spu_debug) + if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) { - this->dump(log); + this->dump(func, log); fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); } @@ -4745,11 +4740,17 @@ public: // Register function pointer const spu_function_t fn = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); - if (!m_spurt->add(fn_location, fn)) + // Install unconditionally, possibly replacing existing one from spu_fast + add_loc->compiled = fn; + + // Rebuild trampoline if necessary + if (!m_spurt->rebuild_ubertrampoline(func[1])) { return nullptr; } + add_loc->compiled.notify_all(); + if (g_cfg.core.spu_debug) { out.flush(); @@ -8236,7 +8237,7 @@ std::unique_ptr spu_recompiler_base::make_llvm_recompiler(u struct spu_llvm { // Workload - lf_queue> registered; + lf_queue registered; void operator()() { @@ -8259,7 +8260,10 @@ struct spu_llvm continue; } - const std::vector& func = spu_runtime::get_func(parg->first); + const std::vector& func = (*parg)->data; + + // Old function pointer (pre-recompiled) + const spu_function_t _old = (*parg)->compiled; // Get data start const u32 start = func[0]; @@ -8272,17 +8276,17 @@ struct spu_llvm } // Call analyser - const std::vector& func2 = compiler->analyse(ls.data(), func[0]); + std::vector func2 = compiler->analyse(ls.data(), func[0]); if (func2.size() != size0) { LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); } - if (const auto target = compiler->compile(func, parg->first)) + if (const auto target = compiler->compile(std::move(func2))) { // Redirect old function (TODO: patch in multiple places) - const s64 rel = reinterpret_cast(target) - reinterpret_cast(parg->second) - 5; + const s64 rel = reinterpret_cast(target) - reinterpret_cast(_old) - 5; union { @@ -8296,28 +8300,17 @@ struct spu_llvm bytes[6] = 0x90; bytes[7] = 0x90; - atomic_storage::release(*reinterpret_cast(parg->second), result); + atomic_storage::release(*reinterpret_cast(_old), result); } else { - LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func2[0]); + LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func[0]); + Emu.Pause(); + return; } // Clear fake LS - for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4) - { - if (se_storage::swap(func2[i]) != ls[pos / 4]) - { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed at 0x%x", func2[0], pos); - } - - ls[pos / 4] = 0; - } - - if (func2.size() != size0) - { - std::memset(ls.data(), 0, 0x40000); - } + std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1)); } } @@ -8336,27 +8329,26 @@ struct spu_fast : public spu_recompiler_base } } - virtual spu_function_t compile(const std::vector& func, void* fn_location) override + virtual spu_function_t compile(std::vector&& _func) override { - if (!fn_location) - { - fn_location = m_spurt->find(func); - } + const auto add_loc = m_spurt->add_empty(std::move(_func)); - if (fn_location == spu_runtime::g_dispatcher) - { - return &dispatch; - } - - if (!fn_location) + if (!add_loc) { return nullptr; } - if (g_cfg.core.spu_debug) + if (add_loc->compiled) + { + return add_loc->compiled; + } + + const std::vector& func = add_loc->data; + + if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) { std::string log; - this->dump(log); + this->dump(func, log); fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); } @@ -8639,15 +8631,29 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x28; *raw++ = 0xc3; - if (!m_spurt->add(fn_location, reinterpret_cast(result))) + const auto fn = reinterpret_cast(result); + + // Install pointer carefully + const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn); + + if (added) + { + // Send work to LLVM compiler thread + g_fxo->get()->registered.push(add_loc); + } + + // Rebuild trampoline if necessary + if (!m_spurt->rebuild_ubertrampoline(func[1])) { return nullptr; } - // Send work to LLVM compiler thread; after add() to avoid race - g_fxo->get()->registered.push(fn_location, result); + if (added) + { + add_loc->compiled.notify_all(); + } - return reinterpret_cast(result); + return fn; } }; diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index c4b5e5d77b..d824170ba2 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -1,9 +1,8 @@ #pragma once #include "Utilities/File.h" -#include "Utilities/mutex.h" -#include "Utilities/cond.h" #include "Utilities/JIT.h" +#include "Utilities/lockless.h" #include "SPUThread.h" #include #include @@ -37,33 +36,47 @@ public: static void initialize(); }; +class spu_item +{ +public: + // SPU program + const std::vector data; + + // Compiled function pointer + atomic_t compiled = nullptr; + + // Ubertrampoline generated for this item when it was latest + atomic_t trampoline = nullptr; + + atomic_t cached = false; + atomic_t logged = false; + + spu_item(std::vector&& data) + : data(std::move(data)) + { + } + + spu_item(const spu_item&) = delete; + + spu_item& operator=(const spu_item&) = delete; +}; + // Helper class class spu_runtime { - mutable shared_mutex m_mutex; - - mutable cond_variable m_cond; - struct func_compare { // Comparison function for SPU programs bool operator()(const std::vector& lhs, const std::vector& rhs) const; }; - // All functions - std::map, spu_function_t, func_compare> m_map; - - // All functions as PIC - std::map, spu_function_t> m_pic_map; + // All functions (2^20 bunches) + std::array, (1 << 20)> m_stuff; // Debug module output location std::string m_cache_path; - // Scratch vector - std::vector, spu_function_t>> m_flat_list; - public: - // Trampoline to spu_recompiler_base::dispatch static const spu_function_t tr_dispatch; @@ -88,23 +101,15 @@ public: return m_cache_path; } - // Add compiled function and generate trampoline if necessary - bool add(void* where, spu_function_t compiled); - -private: + // Rebuild ubertrampoline for given identifier (first instruction) spu_function_t rebuild_ubertrampoline(u32 id_inst); +private: friend class spu_cache; + public: - - // Return opaque pointer for add() - void* find(const std::vector&); - - // Get func from opaque ptr - static inline const std::vector& get_func(void* _where) - { - return static_cast(_where)->first; - } + // Return new pointer for add() + spu_item* add_empty(std::vector&&); // Find existing function spu_function_t find(const u32* ls, u32 addr) const; @@ -129,31 +134,6 @@ public: // Interpreter entry point static spu_function_t g_interpreter; - - // Exclusive lock - struct writer_lock - { - spu_runtime& _this; - bool notify = false; - - writer_lock(const writer_lock&) = delete; - - writer_lock(spu_runtime& _this) - : _this(_this) - { - _this.m_mutex.lock(); - } - - ~writer_lock() - { - _this.m_mutex.unlock(); - - if (notify) - { - _this.m_cond.notify_all(); - } - } - }; }; // SPU Recompiler instance base class @@ -303,9 +283,6 @@ private: // For private use std::vector workload; - // Result of analyse(), to avoid copying and allocation - std::vector result; - public: spu_recompiler_base(); @@ -314,11 +291,8 @@ public: // Initialize virtual void init() = 0; - // Compile function (may fail) - virtual spu_function_t compile(const std::vector&, void*) = 0; - - // Compile function, handle failure - void make_function(const std::vector&); + // Compile function + virtual spu_function_t compile(std::vector&&) = 0; // Default dispatch function fallback (second arg is unused) static void dispatch(spu_thread&, void*, u8* rip); @@ -330,10 +304,10 @@ public: static void old_interpreter(spu_thread&, void* ls, u8*); // Get the function data at specified address - const std::vector& analyse(const be_t* ls, u32 lsa); + std::vector analyse(const be_t* ls, u32 lsa); // Print analyser internal state - void dump(std::string& out); + void dump(const std::vector& result, std::string& out); // Get SPU Runtime spu_runtime& get_runtime()